/* -*- C -*- */
/*
 * Copyright (c) 2007 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

#include "fftw-spu.h"
#include "../fftw-cell.h"

/* Maximum tile extent, in elements, along each of the two indices. */
#define BLOCKSZ 64

/*
 * Per-SPE worker that rearranges, in place, the n-by-n array found at t->A.
 *
 * Element (row, col) is addressed as t->A + row * s1_bytes + col * s0_bytes.
 * The index space is cut into tiles of at most BLOCKSZ x BLOCKSZ elements,
 * and only tiles with col >= row are enumerated.  Every tile is given a
 * running sequence number; this SPE processes exactly those tiles whose
 * number is congruent to t->my_id modulo t->nspe and skips the rest.
 *
 * A tile on the diagonal is fetched and then written back to the same
 * address with the two (count, stride) argument pairs exchanged.  A tile
 * off the diagonal is fetched together with its mirror tile, and each of
 * the two is then stored at the other's address, again with the pairs
 * exchanged.
 *
 * NOTE(review): spu_dma2d is defined elsewhere; from its use here it appears
 * to take (local buffer, remote address, count, byte stride, count, byte
 * stride, MFC command), and exchanging the pairs between GET and PUT is
 * presumably what performs the transposition -- confirm against its
 * definition.
 */
void X(spu_do_transpose)(const struct transpose_context *t)
{
    int dim = t->n;
    int nworkers = t->nspe;
    int self = t->my_id;
    int s1_bytes = t->s1_bytes;
    int s0_bytes = t->s0_bytes;
    int seq = 0;            /* running number of the tile being visited */
    int row, col;           /* origin of the current tile */
    int nrow, ncol;         /* extent of the current tile */
    R *buf0, *buf1;         /* raw staging buffers */
    R *tile0, *tile1;       /* staging pointers adjusted by ALIGN_LIKE */

    /*
     * Two staging buffers, each large enough for a full tile.
     * NOTE(review): the factor 2 presumably accounts for two R values per
     * element and the extra ALIGNMENT bytes for the shift applied by
     * ALIGN_LIKE -- confirm.
     */
    X(spu_alloc_reset)();
    buf0 = X(spu_alloc)(BLOCKSZ * BLOCKSZ * 2 * sizeof(R) + ALIGNMENT);
    buf1 = X(spu_alloc)(BLOCKSZ * BLOCKSZ * 2 * sizeof(R) + ALIGNMENT);

    for (row = 0; row < dim; row += nrow) {
        /* the last tile along this index may be shorter than BLOCKSZ */
        nrow = (dim - row < BLOCKSZ) ? (dim - row) : BLOCKSZ;

        for (col = row; col < dim; col += ncol) {
            int owner;
            int upper_off;  /* byte offset of tile (row, col) */
            int lower_off;  /* byte offset of the mirror tile (col, row) */

            ncol = (dim - col < BLOCKSZ) ? (dim - col) : BLOCKSZ;

            /* the counter advances for every tile, ours or not */
            owner = seq++ % nworkers;
            if (owner == self) {
                upper_off = row * s1_bytes + col * s0_bytes;

                if (row == col) {
                    /* diagonal tile: fetch, then store over itself */
                    tile0 = ALIGN_LIKE(buf0, t->A + upper_off);
                    X(spu_dma2d)(tile0, t->A + upper_off,
                                 ncol, s0_bytes, nrow, s1_bytes,
                                 MFC_GET_CMD);
                    X(spu_dma2d)(tile0, t->A + upper_off,
                                 nrow, s1_bytes, ncol, s0_bytes,
                                 MFC_PUT_CMD);
                } else {
                    /* col > row: exchange the tile with its mirror */
                    lower_off = col * s1_bytes + row * s0_bytes;

                    tile0 = ALIGN_LIKE(buf0, t->A + upper_off);
                    tile1 = ALIGN_LIKE(buf1, t->A + lower_off);

                    /* both fetches are issued before either store */
                    X(spu_dma2d)(tile0, t->A + upper_off,
                                 ncol, s0_bytes, nrow, s1_bytes,
                                 MFC_GET_CMD);
                    X(spu_dma2d)(tile1, t->A + lower_off,
                                 nrow, s0_bytes, ncol, s1_bytes,
                                 MFC_GET_CMD);

                    X(spu_dma2d)(tile0, t->A + lower_off,
                                 ncol, s1_bytes, nrow, s0_bytes,
                                 MFC_PUT_CMD);
                    X(spu_dma2d)(tile1, t->A + upper_off,
                                 nrow, s1_bytes, ncol, s0_bytes,
                                 MFC_PUT_CMD);
                }
            }
        }
    }
}