FFTCPU0.c

From wiki
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
/*
 ============================================================================
 Name        : FFTCPU0.c
 Author      : Michael
 Version     :
 Copyright   : Copyright OCE Technology
 Description : Multicore FFT

 This program demonstrates the use of 1, 2, or 4 CPUs on a S698PM
 multi-core processor to calculate a Fast Fourier Transform (FFT).

 A selection of different input arrays is provided,
 additional input arrays can be created easily.

 At present the input array length must be a power of 2,
 and the scalar type must be C float.

 The functions used are defined in fft.h and documented there.
 Please consult fft.h for more information

 ============================================================================
 */
#include <stdio.h>
#include <stdlib.h>
#include "fft.h"

/*
 * main()
 *
 * Usage:
 *
 * 		Assign values to each of the variables below
 *
 * 			lenP2	 		the power of 2 that gives the length of the input array
 *							(at present from 5 to 17, i.e. length 32 to 131,072)
 *
 * 			whichcpus		the CPUs to use, from 1 to 0xf (binary xxxx, 1 = use)
 *
 * 			numruns			number of test runs - timings are given for total runs
 *
 * 			type 			0 for forward FFT, 1 for inverse FFT
 *
 * 			normalise		0 for not normalized, 1 for normalized
 *
 * 			arrange			0 if data already arranged, 1 if need to arrange
 *
 *    		realonly		0 for (in-phase, quadrature) input, 1 for (in-phase, all 0);
 *
 * 			displayLines	number of lines of data to display
 *
 * 		Set up the complex data input in cin.
 * 		(a number of choices are already set up and commented out, it is simple to add more)
 *
 *		Compile the program.
 *
 *		Use the DMON script provided to load and run the program
 *		This also sets up the CPU stacks and enables the L2 cache.
 *
 *		N.B. L2 cache is disabled by default,
 *			 if not switched on timings are affected dramatically.
 *
 */
int main(int argc,char ** argv)
{
	/*
	 * Initial entry point for all CPUs  -- DO NOT CHANGE
	 */
	if(START_CPU_INDEX != fftCpusStart())		// N.B. only START_CPU returns from this call
	{
		printf("\nAbandoning program, startup problem\n");
		return 0;
	}	// if

	/*
	 * START HERE	- select input size, CPUs to use, number of test runs
	 */
	int lenP2 = 12;					// input array length is this power of 2 (from 5 to 17 at present)

	int whichcpus = 0xf;			// CPUs to use, hex value xxxx, x = 1 to turn on, 0xf for all on

    const int numruns = 200;		// number of test runs

    const int type = 0;				// 0 for forward FFT, 1 for inverse FFT

    const int normalise = 0;		// 0 for not normalized, 1 for normalized

    const int arrange = 1;			// 0 for don't arrange, 1 for arrange

    const int realonly = 0;			// 0 for (in-phase, quadrature), 1 for (in-phase,all 0);

    int displayLines = 64;			// number of lines of the data to display, if enabled below

    // check input and correct if possible, give message and exit if settings incorrect
    if(settingsIncorrect(lenP2, &whichcpus, numruns, type, &displayLines)) return 0;

	/*
	 *  VARIOUS EXAMPLES
	 *  Example lengths depend on lenP2 above
	 *  The examples not in use are commented out
	 *  Other input choices can be set up as (in-phase, quadrature)in cin as desired
	 *  N.B. If realonly is not selected both components should be set up,
	 *  	 otherwise previous data may cause anomalous results.
	 *  	 If realonly is selected, the quadrature part of the input is overwritten
	 *  	 with 0s.
	 */
	fft_cpx * cin = (void *)FFT_CIN;	// start of complex number input data array
	int length = 1 << lenP2;			// length of complex numbers input array

    int i;

	// test: combination of frequencies and phases
	//for (i=0;i<length;++i) {
//		cin[i].re = 1.0 + sin(M_PI*i/(length >> 1))			// in-phase part
	//				    + 3*cos(M_PI*2*i/(length >> 1))
	//				    + 2*cos(M_PI*6*i/(length >> 1))
	//					+ 7*cos(M_PI*((length >> 1) - 1)*i/(length >> 1))
	//				    + 5*cos(M_PI*i);					// Nyquist
	//    cin[i].im = sin(M_PI*3*i/(length >> 1));			// quadrature part
//	}	// for

	// test: ramp
//	for (i=0;i<length;++i) {
//		cin[i].re = i;
//	    cin[i].im = 0;
//	}	// for

	// test: random
//	srand(time(0));
//	for (i=0;i<length;++i) {
//      cin[i].re = rand();
//      cin[i].im = 0;
//  }	// for
// square wave
	for (i=0;i<(length >> 1);++i) {
      cin[i].re = 3;
      cin[i].im = 0;
  }	// for

  	for (i=(length >> 1); i < length;++i) {
      cin[i].re = 0;
      cin[i].im = 0;
  }	// for
  
  
	// show the input data
//	printf("Complex data:\n");
//	showC(cin, displayLines, 0);

	/*
	 * Do FFT test
	 * Does numruns, first with one processor, then with the requested processors (whichcpus)
	 */
	// set up the twiddles array - its length is half that of the input array.
	fft_cpx * tw = (void *)FFT_TWIDDLES;	// preassigned space in SRAM starting at this address
	fillTwiddles(tw,lenP2,type);			// 0 for forward FFT, 1 for inverse FFT

	// if data is in-phase only, i.e. no quadrature component, pack into first half
	if(realonly)
	{
		fftRealOnlyArrange(cin,lenP2);
		lenP2 -= 1;							// half the size, but leave value of length as is
	}	// if

	// if need to arrange, set up mapping array - N.B. it is its own inverse
	if(arrange)
	{
		fftArrangeIndex((void *)FFT_MAP, lenP2);
	}	// if

	// set up a pointer for rearranged input and clear it
	// if 'arrange' set result will be here
	fft_cpx * rain = (void *)FFT_RAIN; 		// preassigned space in SRAM starting at this address
	for(i = 0; i < length; i++)
	{
		rain[i].re = 0;
		rain[i].im = 0;
	}

//	int map[1 << lenP2];
//	fftArrangeIndex(map, lenP2);
//	for (i=0;i<length;++i) {
//		rain[map[i]] = cin[i];
//	}	// for

	// start the cpus being used other than this one - they will halt waiting for work
	startCpus(whichcpus);

	// get ready to time
	clock_t start, finish;					// start and finish times
	unsigned long single, multiple;			// times with single, multiple C{Us

	printf("\n\nDoing %d runs of %d length FFT with 1 processor\n", numruns, length);

	start = clock();
	for(i = 0; i < numruns; i++)
	{
		if(arrange)							// will do rearrangement in parallel
		{
			// do it using this CPU, the startup CPU, usually CPU 0
			fftMulti(cin, lenP2, tw, 1<<START_CPU_INDEX, normalise, arrange, realonly);
		} else {

			// first, rearrange the input into rain
			fftArrangeOut(cin, rain,lenP2);

			// do it using this CPU, the startup CPU, usually CPU 0
			fftMulti(rain, lenP2, tw, 1<<START_CPU_INDEX, normalise, arrange, realonly);
		}	// else
	}	// for

	finish = clock();
	single = finish - start;

	printf("\n\nDone %d runs of %d length FFT with 1 processor, time: %ld\n", numruns, length, single);

	int numcpus = getNumCpus(whichcpus);	// how many CPUs being used

	printf("\nDoing %d runs of %d length FFT with %d processors\n", numruns, length, numcpus);

	start = clock();
	for(i = 0; i < numruns; i++)
	{

		if(arrange)						// will do rearrangement in parallel
		{
			// do it using selected cpus
			fftMulti(cin, lenP2, tw, whichcpus, normalise, arrange, realonly);
		} else {
			// first, rearrange the input
			fftArrangeOut(cin, rain, lenP2);
			fftMulti(rain, lenP2, tw, whichcpus, normalise, arrange, realonly);
		}	// else
	}	// for

	finish = clock();
	multiple = finish - start;

	/*
	 * FINISH - show FFT result and times
	 */
	printf("Complex FFT:\n");
	show8C(rain, length, 0.001, realonly);

	printf("\n\n%d runs of %d length FFT with %d processor, time: %ld\n", numruns, length, 1, single);
	printf("\n%d runs of %d length FFT with %d processors, time: %ld\n", numruns, length, numcpus, multiple);
	if(realonly)
	{
		printf("\nSpeedup factor for %d real data with %d processors: %2.2f\n", length, numcpus, ((float)single)/(float)multiple);
	} else {
		printf("\nSpeedup factor for %d (in-phase,quadrature) with %d processors: %2.2f\n", length, numcpus, ((float)single)/(float)multiple);
	}	// else

	return 0;

}	// main()


// see fft.h for description
static inline unsigned int get_asr17(void)
{
  unsigned int reg;
  __asm__ (" mov %%asr17, %0 " : "=r"(reg) :);
  return reg;
}


// see fft.h for description
int fftCpusStart(void)
{
	volatile unsigned int status = 0;
	volatile fft_cpu *thisCpu;

	// find out which CPU is running
	int LEON3_Cpu_Index = (get_asr17() >> 28) & 3;		// max 4 cpus, 0 .. 3

	// set up reference to data for this CPU
	thisCpu = (void *)(FFT_FLAGS + LEON3_Cpu_Index*sizeof(fft_cpu));

	// if start CPU (usually CPU0), initialize flags area
	if (START_CPU_INDEX == LEON3_Cpu_Index){
		// START_CPU (usually 0) initializes settings and returns
		volatile fft_cpu *thisCpu;
		int i = 0;
		for(i = 0; i < MAX_CPUS; i++)
		{
			thisCpu = (void *)(FFT_FLAGS + i*sizeof(fft_cpu));

			thisCpu->status = 0;
			thisCpu->cinadrs = 0;
			thisCpu->lenP2 = 0;
			thisCpu->begin = 0;
			thisCpu->nextbegin = 0;
			thisCpu->oddadd = 0;
			thisCpu->startP2 = 0;
			thisCpu->twadrs = 0;
			thisCpu->toffset = 0;
			thisCpu->tinc = 0;
			thisCpu->normalise = 0;
			thisCpu->arrange = 0;
			thisCpu->realonly = 0;
			thisCpu->spare1 = 0x01010101;
			thisCpu->spare2 = 0x10101010;
		}	// for

	}else {
		// other CPUS loop forever, check for work, do it, set done status
		while(1)
		{
			thisCpu->spare1 = 0xcafe;						// for debug

			status = thisCpu->status;
			if(FFT_START == (FFT_WORK_MASK & status))
			{
				thisCpu->spare1 = 0xbabecafe;				// for debug

				fftProc((void *)thisCpu->cinadrs, thisCpu->lenP2,
						thisCpu->begin, thisCpu->nextbegin,
						thisCpu->oddadd, thisCpu->startP2,
						(void *)thisCpu->twadrs,
						thisCpu->toffset, thisCpu->tinc,
						thisCpu->normalise,thisCpu->arrange,
						thisCpu->realonly, thisCpu->secondpass);

				thisCpu->spare2 = 0xcafebabe;				// for debug
				thisCpu->status &= ~FFT_WORK_MASK;			// finished

			}	// if

			// halt this CPU, will be restarted when work available
			__asm__ (" wr %g0, %asr19 ");

		}	// while

	}	// else

	return LEON3_Cpu_Index;		// should only get here for CPU 0

}	// fftCpusStart()


// see fft.h for description
void startCpus(const int whichcpus)
{
	int i;
	int LEON3_Cpu_Index = (get_asr17() >> 28) & 3;		// max 4 cpus, 0 .. 3
	int this_cpu = 1 << LEON3_Cpu_Index;

	int current_cpu = LAST_CPU;							// usually 8

	while(current_cpu)
	{
		for(i=0; i < CPU_START_PAUSE; i++){}						// pause
		if((current_cpu & whichcpus)&&(current_cpu != this_cpu))
		{
			*((int *)IRQMP_ADRS) = 0x380b0000 | current_cpu;
		}	// if

		current_cpu >>= 1;

	}	// while

	for(i=0; i < CPU_START_PAUSE; i++){}							// pause
	return;
}	// startCpus()


/*
 * INPUT ARRAY REARRANGEMENT
 */

// see fft.h for description
void fftArrangeOut(fft_cpx* cin, fft_cpx* result, int lenP2)
{

	// validation check
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	int len = 1 << lenP2;
	int mask = len -1;

	int i = 0;
    for(i = 0; i < len; i++){

    	/*
    	 * Find the index to swap with.
    	 * - treating index as potentially 32 bits,
    	 *   number of bits in use should be less
    	 */
    	int v = i;
    	// swap odd and even bits
    	v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
    	// swap consecutive pairs
    	v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
    	// swap nibbles ...
    	v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
    	// swap bytes
    	v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
    	// swap 2-byte long pairs
    	v = ( v >> 16             ) | ( v               << 16);

    	v >>= (32 - lenP2);				// N.B. arithmetic shift
    	v &= mask;

	    if(v >= i)					// otherwise already done
	    {
	    	result[i] = cin[v];
	    	result[v] = cin[i];
	    }	// if

    }	// for

    return;

}	// fftArrangeOut()


// see fft.h for description
void fftArrangeIn(fft_cpx* x, int lenP2)
{
	// validation check
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	// do it
	int len = 1 << lenP2;			// power of 2
	int mask = len - 1;

	fft_cpx temp;					// for swap

	int i = 0;
    for(i = 0; i < len; i++){		// must check full array, not just half

    	// Find the index to swap with
    	// - treating index as no more than 32 bits,
    	int v = i;
    	// swap odd and even bits
    	v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
    	// swap consecutive pairs
    	v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
    	// swap nibbles ...
    	v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
    	// swap bytes
    	v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
    	// swap 2-byte long pairs
    	v = ( v >> 16             ) | ( v               << 16);

    	v >>= (32 - lenP2);			// arithmetic shift
    	v &=  mask;					// kill off any leading 1s due to arithmetic shift

	    if(v > i)					// otherwise already done or same
	    {
	    	temp = x[i];
	    	x[i] = x[v];
	    	x[v] = temp;
	    }	// if

    }	// for

    return;

}	// fftArrangeIn()


// see fft.h for description
void fftArrangeIndex(int* mapindex, int lenP2)
{
	// validation check
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	// do it
	int len = 1 << lenP2;			// power of 2
	int mask = len - 1;

	int i = 0;
    for(i = 0; i < len; i++){		// must check full array, not just half

    	// Find the index to swap with
    	// - treating index as no more than 32 bits,
    	int v = i;
    	// swap odd and even bits
    	v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
    	// swap consecutive pairs
    	v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
    	// swap nibbles ...
    	v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
    	// swap bytes
    	v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
    	// swap 2-byte long pairs
    	v = ( v >> 16             ) | ( v               << 16);

    	v >>= (32 - lenP2);			// arithmetic shift
    	v &=  mask;						// kill off any leading ones

    	mapindex[i] = v;

    }	// for

    return;

}	// fftArrangeIndex()


// see fft.h for description
void fftRealOnlyArrange(fft_cpx * cin, int lenP2)
{
	int i,j;
	int halflen = 1 << (lenP2-1);

	for (i = 0; i < halflen; i++)
	{
		j = i << 1;
		cin[i].re = cin[j].re;
		cin[i].im = cin[j+1].re;
	}	// for

	for (i=halflen; i < (1 << lenP2); i++)
	{
		cin[i].re = 0;
		cin[i].im = 0;
	}	// for

}	// fftRealOnlyArrange()


/*
 * TWIDDLES ARRAY
 */

// see fft.h for description
void fillTwiddles(fft_cpx* twiddles, const int lenP2, const int type)
{
	// validation checks
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nfillTwiddles(): Invalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

    if ((0 != type)&& (1 != type))		// invalid, should be forward(0) or inverse(1)
    {
		printf("\nInvalid type for twiddles array, should be 0 for forward, 1 for inverse, actual was: 0x%x\n",
				type);
		return;
    }	// if

    /*
     * Do it - using doubles
     */
    int n = 1 << lenP2;
    double twoPiDivNsigned;

    if(1 != type)						// negative exponent for forward transform, positive for inverse
    {
        twoPiDivNsigned= -M_PI / (double)(n >> 1);  // 2*Pi/n
    } else {
        twoPiDivNsigned = M_PI / (double)(n >> 1);	// positive exponent for inverse transform
    }	// else

    double root2Inv  = 1.0/sqrt(2.0);	// quicker than sine or cosine

    /*
     * Fill in the values
     */
    int quarter;						// will be N/4, quarter way round
    int eight;							// will be N/8, eight of way around

    twiddles[0].re = 1.0;				// always same
    twiddles[0].im = 0;

    if(1 != type)						// forward FFT - N.B. using negative exponent
    {
    	quarter = n >> 2;
        twiddles[quarter].re = 0;		// quarter way around with negative exponent (0,-i)
        twiddles[quarter].im = -1;

        eight = quarter >> 1;			// eight of way around (1/root2, -i*1/root2)
        twiddles[eight].re = root2Inv;
        twiddles[eight].im = -root2Inv;

        twiddles[quarter+eight].re = -root2Inv;	// three eights of way around (-1/root2, -i*1/root2)
        twiddles[quarter+eight].im = -root2Inv;

        int j;
        for(j=1; j<eight; j++)
        {
        	double theta = (double)j * twoPiDivNsigned;	// negative

        	twiddles[j].re = cos(theta);			// positive
        	twiddles[j].im = sin(theta);			// negative

        	int pos = quarter-j;
        	twiddles[pos].re = -twiddles[j].im;
        	twiddles[pos].im = -twiddles[j].re;

        	pos = quarter+j;
           	twiddles[pos].re =  twiddles[j].im;
            twiddles[pos].im = -twiddles[j].re;

        	pos = (quarter << 1) - j;
        	twiddles[pos].re = -twiddles[j].re;
        	twiddles[pos].im =  twiddles[j].im;
        }	// for

    } else {										// inverse FFT - N.B. uses positive exponent

        quarter = n >> 2;
        twiddles[quarter].re = 0;					// quarter way around (0,i)
        twiddles[quarter].im = 1;					// N.B. positive exponent

        eight = quarter >> 1;						// eight of way around (1/root2, i*1/root2)
        twiddles[eight].re = root2Inv;
        twiddles[eight].im = root2Inv;

        twiddles[quarter+eight].re = -root2Inv;		// three eights of way around (-1/root2, i*1/root2)
        twiddles[quarter+eight].im =  root2Inv;

        int j;
        for(j=1; j<eight; j++)
        {
        	double theta = (double)j * twoPiDivNsigned;		// positive

        	twiddles[j].re = cos(theta);			// positive
        	twiddles[j].im = sin(theta);			// positive

        	int pos = quarter-j;
        	twiddles[pos].re = twiddles[j].im;
        	twiddles[pos].im = twiddles[j].re;

        	pos = quarter+j;
        	twiddles[pos].re =  -twiddles[j].im;
        	twiddles[pos].im = twiddles[j].re;

        	pos = (quarter << 1) - j;
        	twiddles[pos].re = -twiddles[j].re;
        	twiddles[pos].im =  twiddles[j].im;
        }	// for

    }	// else

}	// fillTwiddles()


/*
 * FFT PROCESSING
 */

// see fft.h for description
void fftProc(fft_cpx* cin, const int lenP2,
			 const int start, const int nextstart,
			 const int oddadd, const int startP2,
    		 const fft_cpx* twiddles,
    		 const int toffset, const int twinc,
    		 const int normalise, const int arrange,
    		 const int realonly, const int secondpass)
{
	/*
	 * Input validation
	 */
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nfftProc():Invalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	int length = 1 << lenP2;
	int subarraylen = nextstart - start;

    if((0 > start)||(0 > nextstart)||(0 >= subarraylen))
    {
    	printf("\nInvalid input sub-array indices, start index: %d, next start: %d, sub-array length %d\n", start, nextstart, subarraylen);
    	return;
    }	// if

    if ((start >= length)||(nextstart > length))
    {
    	printf("\nSub-array indices incompatible with array length, start index: %d, next start: %d, array length: %d\n",
    			start, nextstart, length);
    	return;
    }	// if

    if (subarraylen & (subarraylen -1))	// non-zero if not power of 2
    {
    	printf("\nInvalid sub-array length, not a power of 2, start index: %d, next start: %d, sub-array length: %d\n",
    			start, nextstart, subarraylen);
    	return;
    }	// if
//printf("fftProc(): start %d, nextstart %d, oddadd %d, startP2 %d, toffset %d, tinc %d, realonly %d\n",
//			start,nextstart, oddadd, startP2, toffset, twinc, realonly);

	/*
	 * Processing
	 */
    int bi;					// index where this butterfly starts
    int ei;					// index of even
    int oi;					// index of odd

    fft_cpx temp;			// holds current twiddles value
    int tstart = toffset;	// position in twiddles array from which to begin
    int tindex;				// index into twiddles array
    int tlimit;				// tindex should be less than this
    int tinc = twinc;		// increment used stepping through twiddles array
    int lasttinc = 1;		// when tinc gets to this, on last butterfly

    int oddoffset;			// amount to add to even index to get odd index in inputarray
    int butterflysize;		// holds current butterfly size
    int bsh;    			// half of the butterfly size

    float norm;				// used to hold normalizing divisor
    register float twre;	// holds real part of twiddles entry
    register float twim;	// holds imaginary part of twiddles entry

    fft_cpx * arranged = cin;
    fft_cpx * rain = (void *)FFT_RAIN;
    int * map = (void *)FFT_MAP;

    /*
     * If this is the second pass, do post processing of FFT result obtained so far
     * N.B. the FFT data being processed is in the second half of the underlying array,
     * 		the FFT result is put in the first half of the underlying array plus the Nyquist
     * 		component at index [length], the remaining entries in the underlying array
     * 		are not updated with negative frequencies, these are just the complex conjugates
     * 		of the positive frequencies.
     */
    if(secondpass)
    {
    	float value;
    	int i = start;
    	int j = (length << 1) - start;

    	// if at start of twiddles array, avoid need to use value at index [2*length]
    	if(0 == start)
    	{
    		value =  (arranged[0].re + arranged[length].re
    		    	  + arranged[0].im + arranged[length].im)/2.0;

    		arranged[0].im = (arranged[0].im - arranged[length].im
    		     			  - arranged[0].re + arranged[length].re)/2.0;

    		arranged[0].re = value;

    		i++;
    		j--;
    	}

    	// do the remaining entries
    	for(; i < nextstart; i++)
    	{
    		value =  (arranged[i].re + arranged[j].re
    				  + twiddles[i].re*(arranged[i].im + arranged[j].im)
    				  + twiddles[i].im*(arranged[i].re - arranged[j].re))/2.0;

    	    arranged[i].im = (arranged[i].im - arranged[j].im
     			         	  + twiddles[i].im*(arranged[i].im + arranged[j].im)
     			         	  - twiddles[i].re*(arranged[i].re - arranged[j].re))/2.0;

    	    arranged[i].re = value;

    	    j--;
    	}	// for

    	// Nyquist case
       	if(length == nextstart)
    	{
       		arranged[length].re = arranged[length].re - arranged[length].im;
       		arranged[length].im = 0;
    	}	// if

    	return;
    }

    /*
     * Do FFT - only get here if this is not second pass
     *
     * N.B. If the original data was in-phase only,
     * 		the underlying array is assumed to be 2*length
     */

    // if real only is true, adjust the twiddle array steps
    if(realonly)
    {
    	tstart <<= 1;		// twiddles was set up for full-length complex array
    	tinc <<= 1;			// rather than for half length reals only
    	lasttinc <<= 1;
    }	// if

    if(arrange)
    {
    	int i;
    	for(i = start; i < nextstart; i++)
    	{
    		rain[i] = arranged[map[i]];
    	}	// for

    	arranged = rain;
    }	// if

    // double size of butterfly each time around outer loop for this sub-array
    for(butterflysize = 1 << startP2; butterflysize <= subarraylen; butterflysize <<= 1)
    {
     	bsh = butterflysize >> 1;
     	oddoffset = bsh + oddadd;

     	if(lasttinc != tinc)	// not last butterfly
     	{
     		// do all the butterflies in this sub-array for this size of butterfly
     		for( bi = start; bi < nextstart; bi += butterflysize)
     		{
     			ei = bi;
     			oi = ei + oddoffset;

     			tindex = tstart;
     			tlimit = tstart + bsh*tinc;

     			// as log n repeats, worth doing first case separately
     	    	if(0 == tindex)
     	    	{
     				tindex += tinc;

     	    		temp.re = arranged[oi].re;
     				temp.im = arranged[oi].im;

     				arranged[oi].re = arranged[ei].re - temp.re;
     				arranged[oi++].im = arranged[ei].im - temp.im;
     				arranged[ei].re += temp.re;
     				arranged[ei++].im += temp.im;
     	    	}	// if

     			// doing each butterfly
     			while(tindex < tlimit)
     			{
     				twre = twiddles[tindex].re;
     				twim = twiddles[tindex].im;

     				tindex += tinc;

     				temp.re = arranged[oi].re*twre
     						  - arranged[oi].im*twim;
     				temp.im = arranged[oi].re*twim
     				     	  + arranged[oi].im*twre;

     				arranged[oi].re = arranged[ei].re - temp.re;
     				arranged[oi++].im = arranged[ei].im - temp.im;
     				arranged[ei].re += temp.re;
     				arranged[ei++].im += temp.im;




     			}	// while

     		}	// for

     	} else {	// last butterfly, only one butterfly to do. Normalize if requested.

     		ei = start;
     		oi = ei + oddoffset;

     		tindex = tstart;
     		tlimit = tindex + lasttinc*bsh;		// lasttinc will be 1 or 2

     		if(normalise)
     		{
     			norm = (float)(realonly ?(1 << (lenP2 + 1)): 1 << lenP2);

     			// not worth doing single case separately
//    	    	if(0 == tindex)
//     	    	{
//     				tindex += tinc;
//
//     	    		temp.re = arranged[oi].re;
//     				temp.im = arranged[oi].im;
//
//     				arranged[oi].re = (arranged[ei].re - temp.re)/norm;
//     				arranged[oi++].im = (arranged[ei].im - temp.im)/norm;
//     				arranged[ei].re += temp.re;
//     				arranged[ei].re /= 2.0;
//     				arranged[ei].im += temp.im;
//     				arranged[ei++].im /= 2.0;
//     	    	}	// if

     			while(tindex < tlimit)
     			{
//     				twre = twiddles[tindex].re;		// not faster
//     				twim = twiddles[tindex].im;
//
//     				tindex += lasttinc;
//
//     				temp.re = arranged[oi].re*twre
//     						  - arranged[oi].im*twim;
//     				temp.im = arranged[oi].re*twim
//     				     	  + arranged[oi].im*twre;

     				temp.re =  arranged[oi].re*twiddles[tindex].re
     						   - arranged[oi].im*twiddles[tindex].im;
     				temp.im =  arranged[oi].re*twiddles[tindex].im
     						   + arranged[oi].im*twiddles[tindex].re;

     				tindex += lasttinc;

     				arranged[oi].re = (arranged[ei].re - temp.re)/norm;
     				arranged[oi++].im = (arranged[ei].im - temp.im)/norm;
     				arranged[ei].re += temp.re;
     				arranged[ei].re /= norm;
     				arranged[ei].im += temp.im;
     				arranged[ei++].im /= norm;

     			}	// while

     		} else {		// not normalizing

     			// not worth doing single case separately
//     			if(0 == tindex)
//     			{
//     				tindex += tinc;
//
//     		   		temp.re = arranged[oi].re;
//     			    temp.im = arranged[oi].im;
//
//     			    arranged[oi].re = arranged[ei].re - temp.re;
//     			    arranged[oi++].im = arranged[ei].im - temp.im;
//     			    arranged[ei].re += temp.re;
//     			    arranged[ei++].im += temp.im;
//     			}	// if

     			while(tindex < tlimit)
     			{
//     				twre = twiddles[tindex].re;	// not faster
//     				twim = twiddles[tindex].im;
//
//     				tindex += lasttinc;
//
//     				temp.re = arranged[oi].re*twre
//     						  - arranged[oi].im*twim;
//     				temp.im = arranged[oi].re*twim
//     				     	  + arranged[oi].im*twre;

     				temp.re =  arranged[oi].re*twiddles[tindex].re
     			     		   - arranged[oi].im*twiddles[tindex].im;
     			    temp.im =  arranged[oi].re*twiddles[tindex].im
     			     		   + arranged[oi].im*twiddles[tindex].re;

    				tindex += lasttinc;

     			    arranged[oi].re = arranged[ei].re - temp.re;
     			    arranged[oi++].im = arranged[ei].im - temp.im;
     			    arranged[ei].re += temp.re;
     			    arranged[ei++].im += temp.im;

     			}	// while

     		}	// else

     		// if original data was in-phase only, with no quadrature component,
     		// make a copy of the FFT result to be ready for post-processing
     		// N.B. the underlying array needs to be twice the current array size
     		if(realonly)
     		{
    			int i,j,k,m;

     			// copying originals into second half of underlying array
     			j = start + length;
     			m = start + oddoffset;
     			k = start + oddoffset + length;

     			for(i = start; i < nextstart; i++){
     				arranged[j].re 	 = arranged[i].re;
     				arranged[j++].im = arranged[i].im;

     				arranged[k].re   = arranged[m].re;
     				arranged[k++].im   = arranged[m++].im;
     			}	// for

     		}	// if

     	}	 // else

     	tinc >>= 1;		// as the butterfly size is doubled the steps are halved

    }	// for

}	// fftProc()


// see fft.h for description
void fftMulti(fft_cpx* cin, const int lenP2,
			  const fft_cpx* twiddles,
    		  const int whichcpus, const int normalise,
    		  const int arrange, const int realonly)
{
	/*
	 * Input validation
	 */
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nfftMulti():Invalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	if ((0 == whichcpus)||(whichcpus & ~CPUS_MASK))
	{
	    printf("\nInvalid choice of CPUs, input : 0x%x, must be between 0 and 0x%x\n", whichcpus, CPUS_MASK);
	    return;
	}	// if


	/*
	 * Processing
	 */
	int length = 1 << lenP2;

	int blocksize;

	int numcpus = getNumCpus(whichcpus);
	int first,second,next;

	fft_cpx * use;
	if(arrange)
	{
		use = (void *)FFT_RAIN;
	} else {
		use = cin;
	}	// else

	switch (numcpus)
	{
		case 1:
			blocksize = 1 << lenP2;			// length

			// main processing
			giveToCpu(whichcpus,
					  cin, lenP2,
					  0, blocksize,
					  0, 1,
					  twiddles, 0, 1 << (lenP2-1),
					  normalise, arrange,
					  realonly,0);

			// MUST WAIT FOR THiS TO HAVE FINISHED
			fftWait(whichcpus);

			if(realonly)
			{
				giveToCpu(whichcpus,
						  use, lenP2,
						  0, blocksize,
						  0, 1,
						  twiddles, 0, 2,
						  normalise, 0,
						  realonly,1);
			}	// if

			// MUST WAIT FOR THiS TO HAVE FINISHED
			fftWait(whichcpus);
			break;

		case 2:
		case 3:
			blocksize = 1 << (lenP2 - 1);	// half of the length

			// get the CPU to use
			if (0 != (whichcpus & 8))first = 8;			// 1xxx
			else if (0 != (whichcpus & 4))first = 4;	// 01xx
			else first = 2;								// 001x

			next = first >> 1;

			if (0 != (whichcpus & next))second = next;
			else if (0 != (whichcpus & (next >> 1)))second = (next >> 1);
			else second = 1;

			// main processing - most of the time is spent here
			giveToCpu(first,
					  cin, lenP2,
					  0, blocksize,
					  0, 1,
					  twiddles, 0, 1 << (lenP2-1),
					  normalise, arrange,
					  realonly, 0);

			giveToCpu(second,
					  cin, lenP2,
					  blocksize, length,
					  0, 1,
					  twiddles, 0, 1 << (lenP2-1),
					  normalise, arrange,
					  realonly,0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(first|second);			// wait for both to finish

			// combine results
			giveToCpu(first,
					  use, lenP2,
					  0, blocksize,
					  blocksize >> 1, lenP2-1,
					  twiddles, 0, 1,
					  normalise, 0,
					  realonly, 0);

			giveToCpu(second,
					  use, lenP2,
					  blocksize >> 1, (blocksize >> 1) + blocksize,
					  blocksize >> 1, lenP2-1,
					  twiddles, blocksize >> 1, 1,
					  normalise, 0,
					  realonly, 0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(first|second);			// wait for both to finish

			// complete the real only processing
			if(realonly)
			{
				giveToCpu(first,
						  use, lenP2,
						  0, blocksize,
						  0, 1,
						  twiddles, 0, 2,
						  normalise, 0,
						  realonly, 1);

				giveToCpu(second,
						  use, lenP2,
						  blocksize, length,
						  0, 1,
						  twiddles, blocksize, 2,
						  normalise, 0,
						  realonly, 1);

				// MUST WAIT FOR THESE TO HAVE FINISHED
				fftWait(first|second);			// wait for both to finish
			}	// if

			break;

		case 4:

			blocksize = 1 << (lenP2 - 2);	// quarter of the length

	    	// main processing - most of the time is spent here
	    	giveToCpu(8,
	    			  cin, lenP2,
	    			  0, blocksize,
	    			  0, 1,
	    			  twiddles, 0, 1 << (lenP2-1),
	    			  normalise, arrange,
	    			  realonly, 0);
	    	giveToCpu(4,
	    			  cin, lenP2,
	    			  blocksize, blocksize << 1,
	    			  0, 1,
	    			  twiddles, 0, 1 << (lenP2-1),
	    			  normalise, arrange,
	    			  realonly, 0);
	    	giveToCpu(2,
	    			  cin, lenP2,
	    			  blocksize<<1, (blocksize << 1) + blocksize,
	    			  0, 1,
	    			  twiddles, 0, 1 << (lenP2-1),
	    			  normalise, arrange,
	    			  realonly, 0);
	    	giveToCpu(1,
	    			  cin, lenP2,
	    			  (blocksize << 1) + blocksize, length,
	    			  0, 1,
	    			  twiddles, 0, 1 << (lenP2-1),
	    			  normalise, arrange,
	    			  realonly, 0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(0xf);			// wait for all to finish

			// combine results
	    	giveToCpu(8,
	    			  use, lenP2,
	    			  0, blocksize,
	    			  blocksize>>1, lenP2 - 2,
	    			  twiddles, 0, 2,
	    			  normalise, 0,
	    			  realonly, 0);
	    	giveToCpu(4,
	    			  use, lenP2,
	    			  blocksize>>1, (blocksize >> 1) + blocksize,
	    			  blocksize >> 1, lenP2 -2,
	    			  twiddles, blocksize, 2,
	    			  normalise, 0,
	    			  realonly, 0);
	    	giveToCpu(2,
	    			  use, lenP2,
	    			  blocksize<<1, 3*blocksize,
	    			  blocksize >> 1, lenP2 -2,
	    			  twiddles, 0, 2,
	    			  normalise, 0,
	    			  realonly, 0);
	    	giveToCpu(1,
	    			  use, lenP2,
	    			  (blocksize <<1) + (blocksize >> 1), 3 * blocksize + (blocksize >> 1),
	    			  blocksize >> 1, lenP2 -2,
	    			  twiddles, blocksize, 2,
	    			  normalise, 0,
	    			  realonly, 0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(0xf);			// wait for all to finish

			// combine again
	        giveToCpu(8,
	        		  use, lenP2,
	        		  0, blocksize,
	        		  3*(blocksize >> 1), lenP2-2,
	        		  twiddles, 0, 1,
	        		  normalise, 0,
	        		  realonly, 0);
	        giveToCpu(4,
	        		  use, lenP2,
	        		  blocksize >> 1,(blocksize >> 1) + blocksize,
	        		  3*(blocksize >> 1), lenP2-2,
	        		  twiddles, blocksize>>1, 1,
	        		  normalise, 0,
	        		  realonly, 0);
	        giveToCpu(2,
	        		  use, lenP2,
	        		  blocksize,(blocksize<<1),
	        		  3*(blocksize >> 1), lenP2-2,
	        		  twiddles, blocksize, 1,
	        		  normalise, 0,
	        		  realonly, 0);
	        giveToCpu(1,
	        		  use, lenP2 ,
	        		  blocksize + (blocksize >> 1),(blocksize << 1) + (blocksize >>1),
	        		  3*(blocksize >> 1), lenP2-2,
	        		  twiddles, 3*(blocksize >> 1), 1,
	        		  normalise, 0,
	        		  realonly, 0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(0xf);			// wait for all to finish

			// complete the real only processing
			if(realonly)
			{
				giveToCpu(8,
						  use, lenP2,
					      0, blocksize,
					      0, 1,
					      twiddles, 0, 2,
					      normalise, 0,
					      realonly, 1);

				giveToCpu(4,
						  use, lenP2,
					      blocksize, blocksize << 1,
					      0, 1,
					      twiddles, 0, 2,
					      normalise, 0,
					      realonly, 1);

				giveToCpu(2,
						  use, lenP2,
					      blocksize<<1, 3*blocksize,
					      0, 1,
					      twiddles, 0, 2,
					      normalise, 0,
					      realonly, 1);

				giveToCpu(1,
						  use, lenP2 ,
					      3*blocksize,blocksize << 2,
					      0, 1,
					      twiddles, 0, 2,
					      normalise, 0,
					      realonly, 1);

				// MUST WAIT FOR THESE TO HAVE FINISHED
				fftWait(first|second);			// wait for both to finish
			}	// if

			break;

	}	// switch

}	// fftMulti()


// see fft.h for description
void giveToCpu(const int which,								// 1,2,4 or 8
			   fft_cpx* cin, const int lenP2,				// input array
			   const int begin, const int nextbegin,		// part indices
			   const int oddadd, const int startP2,
			   const fft_cpx* twiddles,
			   const int toffset, const int twinc,
			   const int normalise, const int arrange,
			   const int realonly, const int secondpass)
{
	if(0 == which) return;					// nothing to do this time


	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\ngiveToCpu(): Invalid power of 2 for array length: %d, should be from %d to %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	// get the index into the FFT_FLAGS
    int cpuindex;
    if(1 == which) cpuindex = 0;
    else if(2 == which) cpuindex = 1;
    else if(4 == which) cpuindex = 2;
    else if(8 == which) cpuindex = 3;

    else return;

	volatile fft_cpu *thisCpu = (void *)(FFT_FLAGS + cpuindex*sizeof(fft_cpu));

	thisCpu->cinadrs	= (unsigned int)cin;
	thisCpu->lenP2		= (unsigned int)lenP2;
	thisCpu->begin		= (unsigned int)begin;
	thisCpu->nextbegin	= (unsigned int)nextbegin;
	thisCpu->oddadd		= (unsigned int)oddadd;
	thisCpu->startP2	= (unsigned int)startP2;
	thisCpu->twadrs 	= (unsigned int)twiddles;
	thisCpu->toffset	= (unsigned int)toffset;
	thisCpu->tinc		= (unsigned int)twinc;
	thisCpu->normalise 	= (unsigned int)normalise;
	thisCpu->arrange 	= (unsigned int)arrange;
	thisCpu->realonly   = (unsigned int)realonly;
	thisCpu->secondpass = (unsigned int)secondpass;

	thisCpu->spare2		= 0xa5a5a5a5;					// for debug
	thisCpu->status		= (unsigned int)FFT_START;		// do this last

    // if the CPU is the one running this code, do the FFT here
    if(START_CPU_INDEX == cpuindex)
    {
    	thisCpu->status |= FFT_START;
		thisCpu->spare1 = 0xbabecafe;				// for debug

    	fftProc((void *)thisCpu->cinadrs, thisCpu->lenP2,
				thisCpu->begin, thisCpu->nextbegin,
				thisCpu->oddadd, thisCpu->startP2,
				(void *)thisCpu->twadrs,
				thisCpu->toffset, thisCpu->tinc,
				thisCpu->normalise, thisCpu->arrange,
				thisCpu->realonly, thisCpu->secondpass);

    	thisCpu->spare2 = 0xcafebabe;				// for debug
		thisCpu->status &= ~FFT_WORK_MASK;			// finished

    } else {
    	*((int *)IRQMP_ADRS) = 0x380b0000 | which;
    }	// else

}	// giveToCpu()


// see fft.h for description
void fftWait(int whichcpus){

	// other CPUS loop forever, check for work, do it, set done status

	volatile fft_cpu *cpu0 = (void *)(FFT_FLAGS);
	volatile fft_cpu *cpu1 = (void *)(FFT_FLAGS + sizeof(fft_cpu));
	volatile fft_cpu *cpu2 = (void *)(FFT_FLAGS + 2*sizeof(fft_cpu));
	volatile fft_cpu *cpu3 = (void *)(FFT_FLAGS + 3*sizeof(fft_cpu));

//	volatile fft_cpu * cpus = (void *)FFT_FLAGS;
	int notdone = 0;
	do
	{
		notdone =  (8 & whichcpus)? FFT_WORK_MASK & cpu3->status : 0;
		notdone |= (4 & whichcpus)? FFT_WORK_MASK & cpu2->status : 0;
		notdone |= (2 & whichcpus)? FFT_WORK_MASK & cpu1->status: 0;
		notdone |= (1 & whichcpus)? FFT_WORK_MASK & cpu0->status: 0;

	} while (notdone);

	return;

}	// fftWait()

// see fft.h for description
void fftRealReturn(fft_cpx * cin, const int halflenP2, fft_cpx * twiddles)
{
	fft_scalar value;
	int i,j;

	int length = 1 << halflenP2;

	// results overwrite originals that are needed subsequently,
	// so copy originals into second half of underlying array
	j = length;
	for(i =0; i <= length; i++){
		cin[j].re 	 = cin[i].re;
		cin[j++].im  = cin[i].im;
	}	// for

	j = (length << 1) +1;
	for(i = 0; i < length; i++)
	{
		j--;
		value =  (cin[i].re + cin[j].re
			      + twiddles[i].re*(cin[i].im + cin[j].im)
			      + twiddles[i].im*(cin[i].re - cin[j].re))/2.0;

		cin[i].im = (cin[i].im - cin[j].im
			         + twiddles[i].im*(cin[i].im + cin[j].im)
			         - twiddles[i].re*(cin[i].re - cin[j].re))/2.0;

		cin[i].re = value;

	}	// for

	// Nyquist case
	// if they existed, twiddles[length].re would be -1, twiddles[length].im would be 0;
	cin[length].re = cin[length].re - cin[length].im;
	cin[length].im = 0;

	// copy positive frequency  results to negative, doing complex conjugate
	for(i = 1; i < length ; i++)
 	{
 		cin[length+i].re = cin[length-i].re;
 		cin[length+i].im = -cin[length-i].im;
 	}	// for

	return;
}	// fftRealReturn()


/*
 * UTILITIES
 */

// see fft.h for description
int getNumCpus(const int whichcpus)
{
	// set up count of cpus to use
	int numcpus = 0;				// how many cpus to use

	numcpus += (whichcpus & 1)? 1 : 0;
	numcpus += (whichcpus & 2)? 1 : 0;
	numcpus += (whichcpus & 4)? 1 : 0;
	numcpus += (whichcpus & 8)? 1 : 0;

    return numcpus;

}	// getNumCpus


// see fft.h for description
int settingsIncorrect(const int lenP2, int *cpusrequest, const int numruns, const int type, int* lines)
{
	int result = 0;
	// array size is limited by the way memory is being allocated at present
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		result |= 1;
	}	// if

	int whichcpus = *cpusrequest;
	if((0 == whichcpus)||(whichcpus & ~CPUS_MASK))
	{
		printf("\nInvalid choice of CPUs: 0x%x, must be from 1 to 0x%x\n",
				whichcpus, CPUS_MASK);
		result |= 1;
	}	// if

	// get the count of cpus being requested
	int numcpus = getNumCpus(whichcpus);

	// only allow 1, 2 or 4 cpus, if 3 deselect the lowest order one
	if(3 == numcpus)
	{
	    if (whichcpus & 1) whichcpus &= 0xe;		// deselect CPU 0
	    else whichcpus &= 0xd;						// deselect CPU 1
	    *cpusrequest = whichcpus;
	    numcpus = 2;
	    printf("\nChoice of 3 CPUs invalid, will use these 2 CPUs : 0x%x\n", whichcpus);
	}	// if

	// find how many CPUs are present on system
	volatile int status = *(int *)IRQMP_ADRS;		// current system configuration
	int statuscpus = (((status & CPUS_PM1_MASK)>>CPUS_PM1_SHIFTS)&0xf) + 1;
	printf("\nMain, Status: 0x%x, CPUs present %d,  CPUs requested: %d, WhichCPUs: 0x%x\n",
			status,statuscpus,numcpus,whichcpus);	// debug

	if((0 == numcpus)||(numcpus > statuscpus))		// sanity check
	{
		printf("Abandoning, number of CPUs requested: %d, number present %d",numcpus, statuscpus);
		result |= 1;
	}	// if

	// check the type
	if((0 != type)&&(1 != type))
	{
		printf("Abandoning, invalid type input: %d, should be 0 for forward FFT or 1 for inverse FFT\n",type);
		result |= 1;
	}	// if

	// check the choice of number of lines to display
	if(0 >= *lines)
	{
		printf("Number of lines chosen was %d, no data or result will be displayed\n",*lines);
	} else if ((1 << lenP2) < *lines){
		printf("Number of lines chosen %d exceeds array length %d, array length will be used\n",*lines,1<<lenP2);
		*lines = 1 << lenP2;
	}
	return result;

}	// settingsIncorrect()


// see fft.h for description
void showC(const fft_cpx* in, const int length, const double tolerance)
{
	int i=0;
	float temp1 = 0.0;
	float temp2 = 0.0;
	printf("Complex numbers array, showing first %d elements\n", length);

	for (i=0; i<length; i++) {

	    temp1 = in[i].re;
	    if(  ((0.0 < temp1)&&( tolerance > temp1))
	       ||((0.0 > temp1)&&(-tolerance < temp1))){
	   	   temp1 = 0.0;
	    }	// if

	    temp2 = in[i].im;
	    if(  ((0 < temp2)&&( tolerance > temp2))
	       ||((0 > temp2)&&(-tolerance < temp2))){
	  	   temp2 = 0.0;
	    }	// if

	    printf("%d:   %g      %g\n", i, temp1, temp2);
	}	// for

}	// showC()


// see fft.h for description
void show8C(const fft_cpx* in, const int length, const double tolerance, const int realonly)
{
	if(32 > length)
	{
		showC(in,length,tolerance);
	}	// if

	int i,k;
	fft_cpx temp;
	float temp1 = 0.0;
	float temp2 = 0.0;
	printf("Complex numbers array, showing certain elements\n\n");

	int nyindex = length>>1;
	int begins[4] = {0, nyindex - 8, nyindex, length-8};

	for(k =0; k<4; k++)
	{
		for (i=0; i<8; i++) {

			temp  = in[begins[k]+i];
			temp1 = temp.re;
			if(  ((0.0 < temp1)&&( tolerance > temp1))
			   ||((0.0 > temp1)&&(-tolerance < temp1))){
		   	   temp1 = 0.0;
			}	// if

			temp2 = temp.im;
			if(  ((0 < temp2)&&( tolerance > temp2))
		       ||((0 > temp2)&&(-tolerance < temp2))){
		  	   temp2 = 0.0;
		    }	// if

		    printf("%d:\t\t\t%g      %g\n", begins[k]+i, temp1, temp2);

		    if((0 == begins[k]+i)||(nyindex == begins[k]+i)) printf("\n");

		    if(realonly && (nyindex == begins[k] + i))
		    {
		    	printf("In-phase data only, last frequency shown above is Nyquist\n");
		    	printf("Negative frequencies not shown (complex conjugates of positive frequencies)\n\n");
		    	break;	// for loop
		    }	// if

		}	// for

		printf("\n");
		if(realonly && (2 == k)) break;

	}	// for

}	// show8C()