/* -*- C -*- */
/*
 * Copyright (c) 2007 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include "fftw-spu.h"
#include "../fftw-cell.h"

/* Decompose N into the radices PP->r, one plan step per t-codelet;
   each step of radix R and length M consumes 2 * (R - 1) * M reals of
   the twiddle array W. */
static void build_plan(struct spu_plan *p, int n, R *W,
                       const struct spu_radices *pp)
{
     int r;
     const signed char *q;

     /* t-codelets */
     for (q = pp->r; (r = *q) > 0; ++q, ++p) {
          n /= r;
          p->r = r;
          p->m = n;
          p->W = W;
          W += 2 * (r - 1) * n;
     }

     /* final n-codelet */
     p->r = r;
}

/* Swap the real and imaginary parts of NCOMPLEX complex numbers in A.
   Since swap(DFT_s(swap(x))) = DFT_{-s}(x), X(spu_do_dft) uses this to
   obtain the opposite-sign transform from a fixed-sign kernel.  The
   main loop is unrolled 8-fold. */
static void flip_ri(R *A, int ncomplex)
{
     int i;

     for (i = 0; i < ncomplex - 7 * VL; i += 8 * VL) {
          V x0 = LD(A + 2 * i + 0 * VL, 0, 0);
          V x1 = LD(A + 2 * i + 2 * VL, 0, 0);
          V x2 = LD(A + 2 * i + 4 * VL, 0, 0);
          V x3 = LD(A + 2 * i + 6 * VL, 0, 0);
          V x4 = LD(A + 2 * i + 8 * VL, 0, 0);
          V x5 = LD(A + 2 * i + 10 * VL, 0, 0);
          V x6 = LD(A + 2 * i + 12 * VL, 0, 0);
          V x7 = LD(A + 2 * i + 14 * VL, 0, 0);
          x0 = FLIP_RI(x0);
          x1 = FLIP_RI(x1);
          x2 = FLIP_RI(x2);
          x3 = FLIP_RI(x3);
          x4 = FLIP_RI(x4);
          x5 = FLIP_RI(x5);
          x6 = FLIP_RI(x6);
          x7 = FLIP_RI(x7);
          ST(A + 2 * i + 0 * VL, x0, 0, 0);
          ST(A + 2 * i + 2 * VL, x1, 0, 0);
          ST(A + 2 * i + 4 * VL, x2, 0, 0);
          ST(A + 2 * i + 6 * VL, x3, 0, 0);
          ST(A + 2 * i + 8 * VL, x4, 0, 0);
          ST(A + 2 * i + 10 * VL, x5, 0, 0);
          ST(A + 2 * i + 12 * VL, x6, 0, 0);
          ST(A + 2 * i + 14 * VL, x7, 0, 0);
     }
     for (; i < ncomplex; i += VL) {
          V x0 = LD(A + 2 * i, 0, 0);
          x0 = FLIP_RI(x0);
          ST(A + 2 * i, x0, 0, 0);
     }
}

/* Multiply CHUNK rows of R complex numbers each by twiddle factors.
   The twiddles for row M are DMA'd from main-memory address
   WPPU0 + m * r * 2 * sizeof(R) into the staging buffer BUF; MOLD
   caches the previous M so that consecutive rows sharing the same
   twiddles are fetched only once. */
static void bytwiddle(R *A, int r, int chunk, R *buf,
                      int m, int dm, unsigned long long Wppu0)
{
     int v, i;
     int mold = -1; /* invalid value, guaranteeing mold != m initially */
     R *W = 0;

     for (v = 0; v < chunk; ++v, m += dm, A += 2 * r) {
          /* load twiddles if we don't have them already */
          if (mold != m) {
               unsigned long long Wppu = Wppu0 + m * r * 2 * sizeof(R);
               W = ALIGN_LIKE(buf, Wppu);
               X(spu_dma1d)(W, Wppu, r * 2 * sizeof(R), MFC_GET_CMD);
               mold = m;
          }

          for (i = 0; i < r - 3 * VL; i += 4 * VL) {
               V a0 = LD(A + 2 * (i + 0 * VL), 0, 0);
               V a1 = LD(A + 2 * (i + 1 * VL), 0, 0);
               V a2 = LD(A + 2 * (i + 2 * VL), 0, 0);
               V a3 = LD(A + 2 * (i + 3 * VL), 0, 0);
               V b0 = BYTWJ(W + 2 * (i + 0 * VL), a0);
               V b1 = BYTWJ(W + 2 * (i + 1 * VL), a1);
               V b2 = BYTWJ(W + 2 * (i + 2 * VL), a2);
               V b3 = BYTWJ(W + 2 * (i + 3 * VL), a3);
               ST(A + 2 * (i + 0 * VL), b0, 0, 0);
               ST(A + 2 * (i + 1 * VL), b1, 0, 0);
               ST(A + 2 * (i + 2 * VL), b2, 0, 0);
               ST(A + 2 * (i + 3 * VL), b3, 0, 0);
          }
          for (; i < r; i += VL) {
               V a0 = LD(A + 2 * i, 0, 0);
               V b0 = BYTWJ(W + 2 * i, a0);
               ST(A + 2 * i, b0, 0, 0);
          }
     }
}
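/*
 * Worked example for build_plan() above, with a hypothetical radix
 * list (the real lists come from the planner): for n = 128 and
 * pp->r = { 8, 4, -4 }, the loop produces
 *
 *   plan[0]: r = 8, m = 16, W = W0        (W advances by 2*7*16 = 224 reals)
 *   plan[1]: r = 4, m = 4,  W = W0 + 224  (W advances by 2*3*4 = 24 reals)
 *   plan[2]: r = -4                       (final n-codelet; no twiddles)
 *
 * i.e. each t-codelet of radix r consumes 2*(r-1)*m twiddle reals, and
 * the terminating non-positive entry appears to encode the size of the
 * final codelet (4 = 128 / (8*4) here).
 */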
/* compute DFT of contiguous arrays (is = os = 2) */
void X(spu_do_dft)(const struct dft_context *dft)
{
     int v0, v1, vv;
     int chunk;
     const int chunkalign = (ALIGNMENT / (2 * sizeof(R)));
     struct spu_plan plan[MAX_PLAN_LEN];
     R *W, *A, *Aalign, *Balign;
     int n = dft->n;
     int twon = 2 * n;
     int nbytes = twon * sizeof(R);
     const struct cell_iodim *vd = dft->v;
     unsigned long long xi, xo;
     int dm0 = vd[0].dm, dm1 = vd[1].dm;

     X(spu_alloc_reset)();

     /* obtain twiddle factors */
     W = X(spu_alloc)(dft->Wsz_bytes + ALIGNMENT);
     W = ALIGN_LIKE(W, dft->W);
     X(spu_dma1d)(W, dft->W, dft->Wsz_bytes, MFC_GET_CMD);

     /* build plan */
     build_plan(plan, dft->n, W, &dft->r);

     /* CHUNK = how many size-N transforms fit into the remaining
        local store */
     if (dft->is_bytes == 2 * sizeof(R) && dft->os_bytes == 2 * sizeof(R)) {
          chunk = (X(spu_alloc_avail)() - nbytes - 2 * ALIGNMENT) / nbytes;
     } else {
          /* CHUNK must be a multiple of VL for transposed DMA */
          chunk = VL * ((X(spu_alloc_avail)() - nbytes - 2 * ALIGNMENT)
                        / (VL * nbytes));
     }

     /* align CHUNK, but only if CHUNK is less than the whole of
        dimension 0, since the invoking code may be relying upon the
        whole dimension fitting into local store.  Besides, there is
        no point in aligning CHUNK in that case anyway. */
     if (vd[0].n1 > vd[0].n0 + chunk && chunk > chunkalign)
          chunk &= ~(chunkalign - 1);

     A = X(spu_alloc)((chunk + 1) * nbytes + 2 * ALIGNMENT);

     /* for all vector elements we are responsible for: */
     for (v1 = vd[1].n0; v1 < vd[1].n1; v1++) {
          xi = dft->xi + v1 * vd[1].is_bytes;
          xo = dft->xo + v1 * vd[1].os_bytes;

          for (v0 = vd[0].n0; v0 < vd[0].n1; v0 += chunk) {
               int m = v0 * dm0 + v1 * dm1;

               /* clamp the final, possibly partial, chunk */
               if (chunk > vd[0].n1 - v0)
                    chunk = vd[0].n1 - v0;

               Aalign = ALIGN_LIKE(A + ALIGNMENT / sizeof(R) + twon,
                                   xi + v0 * vd[0].is_bytes);
               Balign = ALIGN_LIKE(A, xo + v0 * vd[0].os_bytes);

               /* obtain data */
               X(spu_dma2d)(Aalign, xi + v0 * vd[0].is_bytes,
                            n, /* 2, */ dft->is_bytes,
                            chunk, /* twon, */ vd[0].is_bytes,
                            MFC_GET_CMD);

               if (dft->sign != FFT_SIGN)
                    flip_ri(Aalign, n * chunk);

               if (m > 0 || dm0 > 0)
                    bytwiddle(Aalign, n, chunk, A, m, dm0, dft->Ww);

               /* compute FFTs */
               for (vv = 0; vv < chunk; ++vv)
                    X(spu_execute_plan)(plan, Aalign + vv * twon,
                                        Balign + vv * twon);

               if (m < 0 || dm0 < 0)
                    bytwiddle(Balign, n, chunk, Balign + twon * chunk,
                              -m, -dm0, dft->Ww);

               if (dft->sign != FFT_SIGN)
                    flip_ri(Balign, n * chunk);

               /* write data back */
               X(spu_dma2d)(Balign, xo + v0 * vd[0].os_bytes,
                            n, /* 2, */ dft->os_bytes,
                            chunk, /* twon, */ vd[0].os_bytes,
                            MFC_PUT_CMD);
          }
     }
}
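#if 0
/*
 * Host-side illustration (a minimal sketch, not part of the SPU code
 * above, and excluded from compilation): the chunking pattern of
 * X(spu_do_dft), where CHUNK transforms are processed per iteration
 * and the last iteration is clamped to whatever remains of dimension
 * 0.  The local-store budget and transform size below are made-up
 * numbers.
 */
#include <stdio.h>

int main(void)
{
     int n0 = 0, n1 = 37;            /* hypothetical dimension-0 bounds */
     int avail = 8192, nbytes = 512; /* hypothetical budget and size */
     int chunk = avail / nbytes;     /* 16 transforms fit at once */
     int v0;

     for (v0 = n0; v0 < n1; v0 += chunk) {
          if (chunk > n1 - v0)       /* clamp, as in the loop above */
               chunk = n1 - v0;
          printf("process transforms [%d, %d)\n", v0, v0 + chunk);
     }
     /* prints [0, 16), [16, 32), [32, 37) */
     return 0;
}
#endif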