FFTCPU0.c
Jump to navigation
Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
/* ============================================================================ Name : FFTCPU0.c Author : Michael Version : Copyright : Copyright OCE Technology Description : Multicore FFT This program demonstrates the use of 1, 2, or 4 CPUs on a S698PM multi-core processor to calculate a Fast Fourier Transform (FFT). A selection of different input arrays is provided, additional input arrays can be created easily. At present the input array length must be a power of 2, and the scalar type must be C float. The functions used are defined in fft.h and documented there. Please consult fft.h for more information ============================================================================ */ #include <stdio.h> #include <stdlib.h> #include "fft.h" /* * main() * * Usage: * * Assign values to each of the variables below * * lenP2 the power of 2 that gives the length of the input array * (at present from 5 to 17, i.e. length 32 to 131,072) * * whichcpus the CPUs to use, from 1 to 0xf (binary xxxx, 1 = use) * * numruns number of test runs - timings are given for total runs * * type 0 for forward FFT, 1 for inverse FFT * * normalise 0 for not normalized, 1 for normalized * * arrange 0 if data already arranged, 1 if need to arrange * * realonly 0 for (in-phase, quadrature) input, 1 for (in-phase, all 0); * * displayLines number of lines of data to display * * Set up the complex data input in cin. * (a number of choices are already set up and commented out, it is simple to add more) * * Compile the program. * * Use the DMON script provided to load and run the program * This also sets up the CPU stacks and enables the L2 cache. * * N.B. L2 cache is disabled by default, * if not switched on timings are affected dramatically. * */ int main(int argc,char ** argv) { /* * Initial entry point for all CPUs -- DO NOT CHANGE */ if(START_CPU_INDEX != fftCpusStart()) // N.B. only START_CPU returns from this call { printf("\nAbandoning program, startup problem\n"); return 0; } // if /* * START HERE - select input size, CPUs to use, number of test runs */ int lenP2 = 12; // input array length is this power of 2 (from 5 to 17 at present) int whichcpus = 0xf; // CPUs to use, hex value xxxx, x = 1 to turn on, 0xf for all on const int numruns = 200; // number of test runs const int type = 0; // 0 for forward FFT, 1 for inverse FFT const int normalise = 0; // 0 for not normalized, 1 for normalized const int arrange = 1; // 0 for don't arrange, 1 for arrange const int realonly = 0; // 0 for (in-phase, quadrature), 1 for (in-phase,all 0); int displayLines = 64; // number of lines of the data to display, if enabled below // check input and correct if possible, give message and exit if settings incorrect if(settingsIncorrect(lenP2, &whichcpus, numruns, type, &displayLines)) return 0; /* * VARIOUS EXAMPLES * Example lengths depend on lenP2 above * The examples not in use are commented out * Other input choices can be set up as (in-phase, quadrature)in cin as desired * N.B. If realonly is not selected both components should be set up, * otherwise previous data may cause anomalous results. * If realonly is selected, the quadrature part of the input is overwritten * with 0s. */ fft_cpx * cin = (void *)FFT_CIN; // start of complex number input data array int length = 1 << lenP2; // length of complex numbers input array int i; // test: combination of frequencies and phases //for (i=0;i<length;++i) { // cin[i].re = 1.0 + sin(M_PI*i/(length >> 1)) // in-phase part // + 3*cos(M_PI*2*i/(length >> 1)) // + 2*cos(M_PI*6*i/(length >> 1)) // + 7*cos(M_PI*((length >> 1) - 1)*i/(length >> 1)) // + 5*cos(M_PI*i); // Nyquist // cin[i].im = sin(M_PI*3*i/(length >> 1)); // quadrature part // } // for // test: ramp // for (i=0;i<length;++i) { // cin[i].re = i; // cin[i].im = 0; // } // for // test: random // srand(time(0)); // for (i=0;i<length;++i) { // cin[i].re = rand(); // cin[i].im = 0; // } // for // square wave for (i=0;i<(length >> 1);++i) { cin[i].re = 3; cin[i].im = 0; } // for for (i=(length >> 1); i < length;++i) { cin[i].re = 0; cin[i].im = 0; } // for // show the input data // printf("Complex data:\n"); // showC(cin, displayLines, 0); /* * Do FFT test * Does numruns, first with one processor, then with the requested processors (whichcpus) */ // set up the twiddles array - its length is half that of the input array. fft_cpx * tw = (void *)FFT_TWIDDLES; // preassigned space in SRAM starting at this address fillTwiddles(tw,lenP2,type); // 0 for forward FFT, 1 for inverse FFT // if data is in-phase only, i.e. no quadrature component, pack into first half if(realonly) { fftRealOnlyArrange(cin,lenP2); lenP2 -= 1; // half the size, but leave value of length as is } // if // if need to arrange, set up mapping array - N.B. it is its own inverse if(arrange) { fftArrangeIndex((void *)FFT_MAP, lenP2); } // if // set up a pointer for rearranged input and clear it // if 'arrange' set result will be here fft_cpx * rain = (void *)FFT_RAIN; // preassigned space in SRAM starting at this address for(i = 0; i < length; i++) { rain[i].re = 0; rain[i].im = 0; } // int map[1 << lenP2]; // fftArrangeIndex(map, lenP2); // for (i=0;i<length;++i) { // rain[map[i]] = cin[i]; // } // for // start the cpus being used other than this one - they will halt waiting for work startCpus(whichcpus); // get ready to time clock_t start, finish; // start and finish times unsigned long single, multiple; // times with single, multiple C{Us printf("\n\nDoing %d runs of %d length FFT with 1 processor\n", numruns, length); start = clock(); for(i = 0; i < numruns; i++) { if(arrange) // will do rearrangement in parallel { // do it using this CPU, the startup CPU, usually CPU 0 fftMulti(cin, lenP2, tw, 1<<START_CPU_INDEX, normalise, arrange, realonly); } else { // first, rearrange the input into rain fftArrangeOut(cin, rain,lenP2); // do it using this CPU, the startup CPU, usually CPU 0 fftMulti(rain, lenP2, tw, 1<<START_CPU_INDEX, normalise, arrange, realonly); } // else } // for finish = clock(); single = finish - start; printf("\n\nDone %d runs of %d length FFT with 1 processor, time: %ld\n", numruns, length, single); int numcpus = getNumCpus(whichcpus); // how many CPUs being used printf("\nDoing %d runs of %d length FFT with %d processors\n", numruns, length, numcpus); start = clock(); for(i = 0; i < numruns; i++) { if(arrange) // will do rearrangement in parallel { // do it using selected cpus fftMulti(cin, lenP2, tw, whichcpus, normalise, arrange, realonly); } else { // first, rearrange the input fftArrangeOut(cin, rain, lenP2); fftMulti(rain, lenP2, tw, whichcpus, normalise, arrange, realonly); } // else } // for finish = clock(); multiple = finish - start; /* * FINISH - show FFT result and times */ printf("Complex FFT:\n"); show8C(rain, length, 0.001, realonly); printf("\n\n%d runs of %d length FFT with %d processor, time: %ld\n", numruns, length, 1, single); printf("\n%d runs of %d length FFT with %d processors, time: %ld\n", numruns, length, numcpus, multiple); if(realonly) { printf("\nSpeedup factor for %d real data with %d processors: %2.2f\n", length, numcpus, ((float)single)/(float)multiple); } else { printf("\nSpeedup factor for %d (in-phase,quadrature) with %d processors: %2.2f\n", length, numcpus, ((float)single)/(float)multiple); } // else return 0; } // main() // see fft.h for description static inline unsigned int get_asr17(void) { unsigned int reg; __asm__ (" mov %%asr17, %0 " : "=r"(reg) :); return reg; } // see fft.h for description int fftCpusStart(void) { volatile unsigned int status = 0; volatile fft_cpu *thisCpu; // find out which CPU is running int LEON3_Cpu_Index = (get_asr17() >> 28) & 3; // max 4 cpus, 0 .. 3 // set up reference to data for this CPU thisCpu = (void *)(FFT_FLAGS + LEON3_Cpu_Index*sizeof(fft_cpu)); // if start CPU (usually CPU0), initialize flags area if (START_CPU_INDEX == LEON3_Cpu_Index){ // START_CPU (usually 0) initializes settings and returns volatile fft_cpu *thisCpu; int i = 0; for(i = 0; i < MAX_CPUS; i++) { thisCpu = (void *)(FFT_FLAGS + i*sizeof(fft_cpu)); thisCpu->status = 0; thisCpu->cinadrs = 0; thisCpu->lenP2 = 0; thisCpu->begin = 0; thisCpu->nextbegin = 0; thisCpu->oddadd = 0; thisCpu->startP2 = 0; thisCpu->twadrs = 0; thisCpu->toffset = 0; thisCpu->tinc = 0; thisCpu->normalise = 0; thisCpu->arrange = 0; thisCpu->realonly = 0; thisCpu->spare1 = 0x01010101; thisCpu->spare2 = 0x10101010; } // for }else { // other CPUS loop forever, check for work, do it, set done status while(1) { thisCpu->spare1 = 0xcafe; // for debug status = thisCpu->status; if(FFT_START == (FFT_WORK_MASK & status)) { thisCpu->spare1 = 0xbabecafe; // for debug fftProc((void *)thisCpu->cinadrs, thisCpu->lenP2, thisCpu->begin, thisCpu->nextbegin, thisCpu->oddadd, thisCpu->startP2, (void *)thisCpu->twadrs, thisCpu->toffset, thisCpu->tinc, thisCpu->normalise,thisCpu->arrange, thisCpu->realonly, thisCpu->secondpass); thisCpu->spare2 = 0xcafebabe; // for debug thisCpu->status &= ~FFT_WORK_MASK; // finished } // if // halt this CPU, will be restarted when work available __asm__ (" wr %g0, %asr19 "); } // while } // else return LEON3_Cpu_Index; // should only get here for CPU 0 } // fftCpusStart() // see fft.h for description void startCpus(const int whichcpus) { int i; int LEON3_Cpu_Index = (get_asr17() >> 28) & 3; // max 4 cpus, 0 .. 3 int this_cpu = 1 << LEON3_Cpu_Index; int current_cpu = LAST_CPU; // usually 8 while(current_cpu) { for(i=0; i < CPU_START_PAUSE; i++){} // pause if((current_cpu & whichcpus)&&(current_cpu != this_cpu)) { *((int *)IRQMP_ADRS) = 0x380b0000 | current_cpu; } // if current_cpu >>= 1; } // while for(i=0; i < CPU_START_PAUSE; i++){} // pause return; } // startCpus() /* * INPUT ARRAY REARRANGEMENT */ // see fft.h for description void fftArrangeOut(fft_cpx* cin, fft_cpx* result, int lenP2) { // validation check if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if int len = 1 << lenP2; int mask = len -1; int i = 0; for(i = 0; i < len; i++){ /* * Find the index to swap with. * - treating index as potentially 32 bits, * number of bits in use should be less */ int v = i; // swap odd and even bits v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap consecutive pairs v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap nibbles ... v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap bytes v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap 2-byte long pairs v = ( v >> 16 ) | ( v << 16); v >>= (32 - lenP2); // N.B. arithmetic shift v &= mask; if(v >= i) // otherwise already done { result[i] = cin[v]; result[v] = cin[i]; } // if } // for return; } // fftArrangeOut() // see fft.h for description void fftArrangeIn(fft_cpx* x, int lenP2) { // validation check if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if // do it int len = 1 << lenP2; // power of 2 int mask = len - 1; fft_cpx temp; // for swap int i = 0; for(i = 0; i < len; i++){ // must check full array, not just half // Find the index to swap with // - treating index as no more than 32 bits, int v = i; // swap odd and even bits v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap consecutive pairs v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap nibbles ... v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap bytes v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap 2-byte long pairs v = ( v >> 16 ) | ( v << 16); v >>= (32 - lenP2); // arithmetic shift v &= mask; // kill off any leading 1s due to arithmetic shift if(v > i) // otherwise already done or same { temp = x[i]; x[i] = x[v]; x[v] = temp; } // if } // for return; } // fftArrangeIn() // see fft.h for description void fftArrangeIndex(int* mapindex, int lenP2) { // validation check if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if // do it int len = 1 << lenP2; // power of 2 int mask = len - 1; int i = 0; for(i = 0; i < len; i++){ // must check full array, not just half // Find the index to swap with // - treating index as no more than 32 bits, int v = i; // swap odd and even bits v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap consecutive pairs v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap nibbles ... v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap bytes v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap 2-byte long pairs v = ( v >> 16 ) | ( v << 16); v >>= (32 - lenP2); // arithmetic shift v &= mask; // kill off any leading ones mapindex[i] = v; } // for return; } // fftArrangeIndex() // see fft.h for description void fftRealOnlyArrange(fft_cpx * cin, int lenP2) { int i,j; int halflen = 1 << (lenP2-1); for (i = 0; i < halflen; i++) { j = i << 1; cin[i].re = cin[j].re; cin[i].im = cin[j+1].re; } // for for (i=halflen; i < (1 << lenP2); i++) { cin[i].re = 0; cin[i].im = 0; } // for } // fftRealOnlyArrange() /* * TWIDDLES ARRAY */ // see fft.h for description void fillTwiddles(fft_cpx* twiddles, const int lenP2, const int type) { // validation checks if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nfillTwiddles(): Invalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if if ((0 != type)&& (1 != type)) // invalid, should be forward(0) or inverse(1) { printf("\nInvalid type for twiddles array, should be 0 for forward, 1 for inverse, actual was: 0x%x\n", type); return; } // if /* * Do it - using doubles */ int n = 1 << lenP2; double twoPiDivNsigned; if(1 != type) // negative exponent for forward transform, positive for inverse { twoPiDivNsigned= -M_PI / (double)(n >> 1); // 2*Pi/n } else { twoPiDivNsigned = M_PI / (double)(n >> 1); // positive exponent for inverse transform } // else double root2Inv = 1.0/sqrt(2.0); // quicker than sine or cosine /* * Fill in the values */ int quarter; // will be N/4, quarter way round int eight; // will be N/8, eight of way around twiddles[0].re = 1.0; // always same twiddles[0].im = 0; if(1 != type) // forward FFT - N.B. using negative exponent { quarter = n >> 2; twiddles[quarter].re = 0; // quarter way around with negative exponent (0,-i) twiddles[quarter].im = -1; eight = quarter >> 1; // eight of way around (1/root2, -i*1/root2) twiddles[eight].re = root2Inv; twiddles[eight].im = -root2Inv; twiddles[quarter+eight].re = -root2Inv; // three eights of way around (-1/root2, -i*1/root2) twiddles[quarter+eight].im = -root2Inv; int j; for(j=1; j<eight; j++) { double theta = (double)j * twoPiDivNsigned; // negative twiddles[j].re = cos(theta); // positive twiddles[j].im = sin(theta); // negative int pos = quarter-j; twiddles[pos].re = -twiddles[j].im; twiddles[pos].im = -twiddles[j].re; pos = quarter+j; twiddles[pos].re = twiddles[j].im; twiddles[pos].im = -twiddles[j].re; pos = (quarter << 1) - j; twiddles[pos].re = -twiddles[j].re; twiddles[pos].im = twiddles[j].im; } // for } else { // inverse FFT - N.B. uses positive exponent quarter = n >> 2; twiddles[quarter].re = 0; // quarter way around (0,i) twiddles[quarter].im = 1; // N.B. positive exponent eight = quarter >> 1; // eight of way around (1/root2, i*1/root2) twiddles[eight].re = root2Inv; twiddles[eight].im = root2Inv; twiddles[quarter+eight].re = -root2Inv; // three eights of way around (-1/root2, i*1/root2) twiddles[quarter+eight].im = root2Inv; int j; for(j=1; j<eight; j++) { double theta = (double)j * twoPiDivNsigned; // positive twiddles[j].re = cos(theta); // positive twiddles[j].im = sin(theta); // positive int pos = quarter-j; twiddles[pos].re = twiddles[j].im; twiddles[pos].im = twiddles[j].re; pos = quarter+j; twiddles[pos].re = -twiddles[j].im; twiddles[pos].im = twiddles[j].re; pos = (quarter << 1) - j; twiddles[pos].re = -twiddles[j].re; twiddles[pos].im = twiddles[j].im; } // for } // else } // fillTwiddles() /* * FFT PROCESSING */ // see fft.h for description void fftProc(fft_cpx* cin, const int lenP2, const int start, const int nextstart, const int oddadd, const int startP2, const fft_cpx* twiddles, const int toffset, const int twinc, const int normalise, const int arrange, const int realonly, const int secondpass) { /* * Input validation */ if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nfftProc():Invalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if int length = 1 << lenP2; int subarraylen = nextstart - start; if((0 > start)||(0 > nextstart)||(0 >= subarraylen)) { printf("\nInvalid input sub-array indices, start index: %d, next start: %d, sub-array length %d\n", start, nextstart, subarraylen); return; } // if if ((start >= length)||(nextstart > length)) { printf("\nSub-array indices incompatible with array length, start index: %d, next start: %d, array length: %d\n", start, nextstart, length); return; } // if if (subarraylen & (subarraylen -1)) // non-zero if not power of 2 { printf("\nInvalid sub-array length, not a power of 2, start index: %d, next start: %d, sub-array length: %d\n", start, nextstart, subarraylen); return; } // if //printf("fftProc(): start %d, nextstart %d, oddadd %d, startP2 %d, toffset %d, tinc %d, realonly %d\n", // start,nextstart, oddadd, startP2, toffset, twinc, realonly); /* * Processing */ int bi; // index where this butterfly starts int ei; // index of even int oi; // index of odd fft_cpx temp; // holds current twiddles value int tstart = toffset; // position in twiddles array from which to begin int tindex; // index into twiddles array int tlimit; // tindex should be less than this int tinc = twinc; // increment used stepping through twiddles array int lasttinc = 1; // when tinc gets to this, on last butterfly int oddoffset; // amount to add to even index to get odd index in inputarray int butterflysize; // holds current butterfly size int bsh; // half of the butterfly size float norm; // used to hold normalizing divisor register float twre; // holds real part of twiddles entry register float twim; // holds imaginary part of twiddles entry fft_cpx * arranged = cin; fft_cpx * rain = (void *)FFT_RAIN; int * map = (void *)FFT_MAP; /* * If this is the second pass, do post processing of FFT result obtained so far * N.B. the FFT data being processed is in the second half of the underlying array, * the FFT result is put in the first half of the underlying array plus the Nyquist * component at index [length], the remaining entries in the underlying array * are not updated with negative frequencies, these are just the complex conjugates * of the positive frequencies. */ if(secondpass) { float value; int i = start; int j = (length << 1) - start; // if at start of twiddles array, avoid need to use value at index [2*length] if(0 == start) { value = (arranged[0].re + arranged[length].re + arranged[0].im + arranged[length].im)/2.0; arranged[0].im = (arranged[0].im - arranged[length].im - arranged[0].re + arranged[length].re)/2.0; arranged[0].re = value; i++; j--; } // do the remaining entries for(; i < nextstart; i++) { value = (arranged[i].re + arranged[j].re + twiddles[i].re*(arranged[i].im + arranged[j].im) + twiddles[i].im*(arranged[i].re - arranged[j].re))/2.0; arranged[i].im = (arranged[i].im - arranged[j].im + twiddles[i].im*(arranged[i].im + arranged[j].im) - twiddles[i].re*(arranged[i].re - arranged[j].re))/2.0; arranged[i].re = value; j--; } // for // Nyquist case if(length == nextstart) { arranged[length].re = arranged[length].re - arranged[length].im; arranged[length].im = 0; } // if return; } /* * Do FFT - only get here if this is not second pass * * N.B. If the original data was in-phase only, * the underlying array is assumed to be 2*length */ // if real only is true, adjust the twiddle array steps if(realonly) { tstart <<= 1; // twiddles was set up for full-length complex array tinc <<= 1; // rather than for half length reals only lasttinc <<= 1; } // if if(arrange) { int i; for(i = start; i < nextstart; i++) { rain[i] = arranged[map[i]]; } // for arranged = rain; } // if // double size of butterfly each time around outer loop for this sub-array for(butterflysize = 1 << startP2; butterflysize <= subarraylen; butterflysize <<= 1) { bsh = butterflysize >> 1; oddoffset = bsh + oddadd; if(lasttinc != tinc) // not last butterfly { // do all the butterflies in this sub-array for this size of butterfly for( bi = start; bi < nextstart; bi += butterflysize) { ei = bi; oi = ei + oddoffset; tindex = tstart; tlimit = tstart + bsh*tinc; // as log n repeats, worth doing first case separately if(0 == tindex) { tindex += tinc; temp.re = arranged[oi].re; temp.im = arranged[oi].im; arranged[oi].re = arranged[ei].re - temp.re; arranged[oi++].im = arranged[ei].im - temp.im; arranged[ei].re += temp.re; arranged[ei++].im += temp.im; } // if // doing each butterfly while(tindex < tlimit) { twre = twiddles[tindex].re; twim = twiddles[tindex].im; tindex += tinc; temp.re = arranged[oi].re*twre - arranged[oi].im*twim; temp.im = arranged[oi].re*twim + arranged[oi].im*twre; arranged[oi].re = arranged[ei].re - temp.re; arranged[oi++].im = arranged[ei].im - temp.im; arranged[ei].re += temp.re; arranged[ei++].im += temp.im; } // while } // for } else { // last butterfly, only one butterfly to do. Normalize if requested. ei = start; oi = ei + oddoffset; tindex = tstart; tlimit = tindex + lasttinc*bsh; // lasttinc will be 1 or 2 if(normalise) { norm = (float)(realonly ?(1 << (lenP2 + 1)): 1 << lenP2); // not worth doing single case separately // if(0 == tindex) // { // tindex += tinc; // // temp.re = arranged[oi].re; // temp.im = arranged[oi].im; // // arranged[oi].re = (arranged[ei].re - temp.re)/norm; // arranged[oi++].im = (arranged[ei].im - temp.im)/norm; // arranged[ei].re += temp.re; // arranged[ei].re /= 2.0; // arranged[ei].im += temp.im; // arranged[ei++].im /= 2.0; // } // if while(tindex < tlimit) { // twre = twiddles[tindex].re; // not faster // twim = twiddles[tindex].im; // // tindex += lasttinc; // // temp.re = arranged[oi].re*twre // - arranged[oi].im*twim; // temp.im = arranged[oi].re*twim // + arranged[oi].im*twre; temp.re = arranged[oi].re*twiddles[tindex].re - arranged[oi].im*twiddles[tindex].im; temp.im = arranged[oi].re*twiddles[tindex].im + arranged[oi].im*twiddles[tindex].re; tindex += lasttinc; arranged[oi].re = (arranged[ei].re - temp.re)/norm; arranged[oi++].im = (arranged[ei].im - temp.im)/norm; arranged[ei].re += temp.re; arranged[ei].re /= norm; arranged[ei].im += temp.im; arranged[ei++].im /= norm; } // while } else { // not normalizing // not worth doing single case separately // if(0 == tindex) // { // tindex += tinc; // // temp.re = arranged[oi].re; // temp.im = arranged[oi].im; // // arranged[oi].re = arranged[ei].re - temp.re; // arranged[oi++].im = arranged[ei].im - temp.im; // arranged[ei].re += temp.re; // arranged[ei++].im += temp.im; // } // if while(tindex < tlimit) { // twre = twiddles[tindex].re; // not faster // twim = twiddles[tindex].im; // // tindex += lasttinc; // // temp.re = arranged[oi].re*twre // - arranged[oi].im*twim; // temp.im = arranged[oi].re*twim // + arranged[oi].im*twre; temp.re = arranged[oi].re*twiddles[tindex].re - arranged[oi].im*twiddles[tindex].im; temp.im = arranged[oi].re*twiddles[tindex].im + arranged[oi].im*twiddles[tindex].re; tindex += lasttinc; arranged[oi].re = arranged[ei].re - temp.re; arranged[oi++].im = arranged[ei].im - temp.im; arranged[ei].re += temp.re; arranged[ei++].im += temp.im; } // while } // else // if original data was in-phase only, with no quadrature component, // make a copy of the FFT result to be ready for post-processing // N.B. the underlying array needs to be twice the current array size if(realonly) { int i,j,k,m; // copying originals into second half of underlying array j = start + length; m = start + oddoffset; k = start + oddoffset + length; for(i = start; i < nextstart; i++){ arranged[j].re = arranged[i].re; arranged[j++].im = arranged[i].im; arranged[k].re = arranged[m].re; arranged[k++].im = arranged[m++].im; } // for } // if } // else tinc >>= 1; // as the butterfly size is doubled the steps are halved } // for } // fftProc() // see fft.h for description void fftMulti(fft_cpx* cin, const int lenP2, const fft_cpx* twiddles, const int whichcpus, const int normalise, const int arrange, const int realonly) { /* * Input validation */ if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nfftMulti():Invalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if if ((0 == whichcpus)||(whichcpus & ~CPUS_MASK)) { printf("\nInvalid choice of CPUs, input : 0x%x, must be between 0 and 0x%x\n", whichcpus, CPUS_MASK); return; } // if /* * Processing */ int length = 1 << lenP2; int blocksize; int numcpus = getNumCpus(whichcpus); int first,second,next; fft_cpx * use; if(arrange) { use = (void *)FFT_RAIN; } else { use = cin; } // else switch (numcpus) { case 1: blocksize = 1 << lenP2; // length // main processing giveToCpu(whichcpus, cin, lenP2, 0, blocksize, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly,0); // MUST WAIT FOR THiS TO HAVE FINISHED fftWait(whichcpus); if(realonly) { giveToCpu(whichcpus, use, lenP2, 0, blocksize, 0, 1, twiddles, 0, 2, normalise, 0, realonly,1); } // if // MUST WAIT FOR THiS TO HAVE FINISHED fftWait(whichcpus); break; case 2: case 3: blocksize = 1 << (lenP2 - 1); // half of the length // get the CPU to use if (0 != (whichcpus & 8))first = 8; // 1xxx else if (0 != (whichcpus & 4))first = 4; // 01xx else first = 2; // 001x next = first >> 1; if (0 != (whichcpus & next))second = next; else if (0 != (whichcpus & (next >> 1)))second = (next >> 1); else second = 1; // main processing - most of the time is spent here giveToCpu(first, cin, lenP2, 0, blocksize, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); giveToCpu(second, cin, lenP2, blocksize, length, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly,0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(first|second); // wait for both to finish // combine results giveToCpu(first, use, lenP2, 0, blocksize, blocksize >> 1, lenP2-1, twiddles, 0, 1, normalise, 0, realonly, 0); giveToCpu(second, use, lenP2, blocksize >> 1, (blocksize >> 1) + blocksize, blocksize >> 1, lenP2-1, twiddles, blocksize >> 1, 1, normalise, 0, realonly, 0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(first|second); // wait for both to finish // complete the real only processing if(realonly) { giveToCpu(first, use, lenP2, 0, blocksize, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); giveToCpu(second, use, lenP2, blocksize, length, 0, 1, twiddles, blocksize, 2, normalise, 0, realonly, 1); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(first|second); // wait for both to finish } // if break; case 4: blocksize = 1 << (lenP2 - 2); // quarter of the length // main processing - most of the time is spent here giveToCpu(8, cin, lenP2, 0, blocksize, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); giveToCpu(4, cin, lenP2, blocksize, blocksize << 1, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); giveToCpu(2, cin, lenP2, blocksize<<1, (blocksize << 1) + blocksize, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); giveToCpu(1, cin, lenP2, (blocksize << 1) + blocksize, length, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(0xf); // wait for all to finish // combine results giveToCpu(8, use, lenP2, 0, blocksize, blocksize>>1, lenP2 - 2, twiddles, 0, 2, normalise, 0, realonly, 0); giveToCpu(4, use, lenP2, blocksize>>1, (blocksize >> 1) + blocksize, blocksize >> 1, lenP2 -2, twiddles, blocksize, 2, normalise, 0, realonly, 0); giveToCpu(2, use, lenP2, blocksize<<1, 3*blocksize, blocksize >> 1, lenP2 -2, twiddles, 0, 2, normalise, 0, realonly, 0); giveToCpu(1, use, lenP2, (blocksize <<1) + (blocksize >> 1), 3 * blocksize + (blocksize >> 1), blocksize >> 1, lenP2 -2, twiddles, blocksize, 2, normalise, 0, realonly, 0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(0xf); // wait for all to finish // combine again giveToCpu(8, use, lenP2, 0, blocksize, 3*(blocksize >> 1), lenP2-2, twiddles, 0, 1, normalise, 0, realonly, 0); giveToCpu(4, use, lenP2, blocksize >> 1,(blocksize >> 1) + blocksize, 3*(blocksize >> 1), lenP2-2, twiddles, blocksize>>1, 1, normalise, 0, realonly, 0); giveToCpu(2, use, lenP2, blocksize,(blocksize<<1), 3*(blocksize >> 1), lenP2-2, twiddles, blocksize, 1, normalise, 0, realonly, 0); giveToCpu(1, use, lenP2 , blocksize + (blocksize >> 1),(blocksize << 1) + (blocksize >>1), 3*(blocksize >> 1), lenP2-2, twiddles, 3*(blocksize >> 1), 1, normalise, 0, realonly, 0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(0xf); // wait for all to finish // complete the real only processing if(realonly) { giveToCpu(8, use, lenP2, 0, blocksize, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); giveToCpu(4, use, lenP2, blocksize, blocksize << 1, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); giveToCpu(2, use, lenP2, blocksize<<1, 3*blocksize, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); giveToCpu(1, use, lenP2 , 3*blocksize,blocksize << 2, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(first|second); // wait for both to finish } // if break; } // switch } // fftMulti() // see fft.h for description void giveToCpu(const int which, // 1,2,4 or 8 fft_cpx* cin, const int lenP2, // input array const int begin, const int nextbegin, // part indices const int oddadd, const int startP2, const fft_cpx* twiddles, const int toffset, const int twinc, const int normalise, const int arrange, const int realonly, const int secondpass) { if(0 == which) return; // nothing to do this time if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\ngiveToCpu(): Invalid power of 2 for array length: %d, should be from %d to %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if // get the index into the FFT_FLAGS int cpuindex; if(1 == which) cpuindex = 0; else if(2 == which) cpuindex = 1; else if(4 == which) cpuindex = 2; else if(8 == which) cpuindex = 3; else return; volatile fft_cpu *thisCpu = (void *)(FFT_FLAGS + cpuindex*sizeof(fft_cpu)); thisCpu->cinadrs = (unsigned int)cin; thisCpu->lenP2 = (unsigned int)lenP2; thisCpu->begin = (unsigned int)begin; thisCpu->nextbegin = (unsigned int)nextbegin; thisCpu->oddadd = (unsigned int)oddadd; thisCpu->startP2 = (unsigned int)startP2; thisCpu->twadrs = (unsigned int)twiddles; thisCpu->toffset = (unsigned int)toffset; thisCpu->tinc = (unsigned int)twinc; thisCpu->normalise = (unsigned int)normalise; thisCpu->arrange = (unsigned int)arrange; thisCpu->realonly = (unsigned int)realonly; thisCpu->secondpass = (unsigned int)secondpass; thisCpu->spare2 = 0xa5a5a5a5; // for debug thisCpu->status = (unsigned int)FFT_START; // do this last // if the CPU is the one running this code, do the FFT here if(START_CPU_INDEX == cpuindex) { thisCpu->status |= FFT_START; thisCpu->spare1 = 0xbabecafe; // for debug fftProc((void *)thisCpu->cinadrs, thisCpu->lenP2, thisCpu->begin, thisCpu->nextbegin, thisCpu->oddadd, thisCpu->startP2, (void *)thisCpu->twadrs, thisCpu->toffset, thisCpu->tinc, thisCpu->normalise, thisCpu->arrange, thisCpu->realonly, thisCpu->secondpass); thisCpu->spare2 = 0xcafebabe; // for debug thisCpu->status &= ~FFT_WORK_MASK; // finished } else { *((int *)IRQMP_ADRS) = 0x380b0000 | which; } // else } // giveToCpu() // see fft.h for description void fftWait(int whichcpus){ // other CPUS loop forever, check for work, do it, set done status volatile fft_cpu *cpu0 = (void *)(FFT_FLAGS); volatile fft_cpu *cpu1 = (void *)(FFT_FLAGS + sizeof(fft_cpu)); volatile fft_cpu *cpu2 = (void *)(FFT_FLAGS + 2*sizeof(fft_cpu)); volatile fft_cpu *cpu3 = (void *)(FFT_FLAGS + 3*sizeof(fft_cpu)); // volatile fft_cpu * cpus = (void *)FFT_FLAGS; int notdone = 0; do { notdone = (8 & whichcpus)? FFT_WORK_MASK & cpu3->status : 0; notdone |= (4 & whichcpus)? FFT_WORK_MASK & cpu2->status : 0; notdone |= (2 & whichcpus)? FFT_WORK_MASK & cpu1->status: 0; notdone |= (1 & whichcpus)? FFT_WORK_MASK & cpu0->status: 0; } while (notdone); return; } // fftWait() // see fft.h for description void fftRealReturn(fft_cpx * cin, const int halflenP2, fft_cpx * twiddles) { fft_scalar value; int i,j; int length = 1 << halflenP2; // results overwrite originals that are needed subsequently, // so copy originals into second half of underlying array j = length; for(i =0; i <= length; i++){ cin[j].re = cin[i].re; cin[j++].im = cin[i].im; } // for j = (length << 1) +1; for(i = 0; i < length; i++) { j--; value = (cin[i].re + cin[j].re + twiddles[i].re*(cin[i].im + cin[j].im) + twiddles[i].im*(cin[i].re - cin[j].re))/2.0; cin[i].im = (cin[i].im - cin[j].im + twiddles[i].im*(cin[i].im + cin[j].im) - twiddles[i].re*(cin[i].re - cin[j].re))/2.0; cin[i].re = value; } // for // Nyquist case // if they existed, twiddles[length].re would be -1, twiddles[length].im would be 0; cin[length].re = cin[length].re - cin[length].im; cin[length].im = 0; // copy positive frequency results to negative, doing complex conjugate for(i = 1; i < length ; i++) { cin[length+i].re = cin[length-i].re; cin[length+i].im = -cin[length-i].im; } // for return; } // fftRealReturn() /* * UTILITIES */ // see fft.h for description int getNumCpus(const int whichcpus) { // set up count of cpus to use int numcpus = 0; // how many cpus to use numcpus += (whichcpus & 1)? 1 : 0; numcpus += (whichcpus & 2)? 1 : 0; numcpus += (whichcpus & 4)? 1 : 0; numcpus += (whichcpus & 8)? 1 : 0; return numcpus; } // getNumCpus // see fft.h for description int settingsIncorrect(const int lenP2, int *cpusrequest, const int numruns, const int type, int* lines) { int result = 0; // array size is limited by the way memory is being allocated at present if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); result |= 1; } // if int whichcpus = *cpusrequest; if((0 == whichcpus)||(whichcpus & ~CPUS_MASK)) { printf("\nInvalid choice of CPUs: 0x%x, must be from 1 to 0x%x\n", whichcpus, CPUS_MASK); result |= 1; } // if // get the count of cpus being requested int numcpus = getNumCpus(whichcpus); // only allow 1, 2 or 4 cpus, if 3 deselect the lowest order one if(3 == numcpus) { if (whichcpus & 1) whichcpus &= 0xe; // deselect CPU 0 else whichcpus &= 0xd; // deselect CPU 1 *cpusrequest = whichcpus; numcpus = 2; printf("\nChoice of 3 CPUs invalid, will use these 2 CPUs : 0x%x\n", whichcpus); } // if // find how many CPUs are present on system volatile int status = *(int *)IRQMP_ADRS; // current system configuration int statuscpus = (((status & CPUS_PM1_MASK)>>CPUS_PM1_SHIFTS)&0xf) + 1; printf("\nMain, Status: 0x%x, CPUs present %d, CPUs requested: %d, WhichCPUs: 0x%x\n", status,statuscpus,numcpus,whichcpus); // debug if((0 == numcpus)||(numcpus > statuscpus)) // sanity check { printf("Abandoning, number of CPUs requested: %d, number present %d",numcpus, statuscpus); result |= 1; } // if // check the type if((0 != type)&&(1 != type)) { printf("Abandoning, invalid type input: %d, should be 0 for forward FFT or 1 for inverse FFT\n",type); result |= 1; } // if // check the choice of number of lines to display if(0 >= *lines) { printf("Number of lines chosen was %d, no data or result will be displayed\n",*lines); } else if ((1 << lenP2) < *lines){ printf("Number of lines chosen %d exceeds array length %d, array length will be used\n",*lines,1<<lenP2); *lines = 1 << lenP2; } return result; } // settingsIncorrect() // see fft.h for description void showC(const fft_cpx* in, const int length, const double tolerance) { int i=0; float temp1 = 0.0; float temp2 = 0.0; printf("Complex numbers array, showing first %d elements\n", length); for (i=0; i<length; i++) { temp1 = in[i].re; if( ((0.0 < temp1)&&( tolerance > temp1)) ||((0.0 > temp1)&&(-tolerance < temp1))){ temp1 = 0.0; } // if temp2 = in[i].im; if( ((0 < temp2)&&( tolerance > temp2)) ||((0 > temp2)&&(-tolerance < temp2))){ temp2 = 0.0; } // if printf("%d: %g %g\n", i, temp1, temp2); } // for } // showC() // see fft.h for description void show8C(const fft_cpx* in, const int length, const double tolerance, const int realonly) { if(32 > length) { showC(in,length,tolerance); } // if int i,k; fft_cpx temp; float temp1 = 0.0; float temp2 = 0.0; printf("Complex numbers array, showing certain elements\n\n"); int nyindex = length>>1; int begins[4] = {0, nyindex - 8, nyindex, length-8}; for(k =0; k<4; k++) { for (i=0; i<8; i++) { temp = in[begins[k]+i]; temp1 = temp.re; if( ((0.0 < temp1)&&( tolerance > temp1)) ||((0.0 > temp1)&&(-tolerance < temp1))){ temp1 = 0.0; } // if temp2 = temp.im; if( ((0 < temp2)&&( tolerance > temp2)) ||((0 > temp2)&&(-tolerance < temp2))){ temp2 = 0.0; } // if printf("%d:\t\t\t%g %g\n", begins[k]+i, temp1, temp2); if((0 == begins[k]+i)||(nyindex == begins[k]+i)) printf("\n"); if(realonly && (nyindex == begins[k] + i)) { printf("In-phase data only, last frequency shown above is Nyquist\n"); printf("Negative frequencies not shown (complex conjugates of positive frequencies)\n\n"); break; // for loop } // if } // for printf("\n"); if(realonly && (2 == k)) break; } // for } // show8C()