/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

/**********************************************************/
/* inverse two dimensional DCT, Chen-Wang algorithm       */
/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)             */
/* 32-bit integer arithmetic (8 bit coefficients)         */
/* 11 mults, 29 adds per DCT                              */
/*                                      sE, 18.8.91       */
/**********************************************************/
/* coefficients extended to 12 bit for IEEE1180-1990      */
/* compliance                           sE,  2.1.94       */
/**********************************************************/

/*
 * adapted for ProjectX by Pedro A. Aranda <paaguti@sourceforge.net>
 */

#include <stdio.h>
#include <inttypes.h>
#include "idct.h"
#include "mmx.h"

/* sqrt(2)/2.  Not referenced by any code visible in this file. */
#define ROOT2OVER2 0.70710678118654757

/*
 * Floating-point butterfly weights.  W1..W7 are sqrt(2)*cos(k*pi/16);
 * W4 evaluates to exactly 1 under that formula, W0 is simply 1.
 */
#define W0 1
#define W1 1.3870398453221475  /* sqrt(2)*cos(1*pi/16) */
#define W2 1.3065629648763766  /* sqrt(2)*cos(2*pi/16) */
#define W3 1.1758756024193588  /* sqrt(2)*cos(3*pi/16) */
#define W4 1
#define W5 0.78569495838710235 /* sqrt(2)*cos(5*pi/16) */
#define W6 0.54119610014619712 /* sqrt(2)*cos(6*pi/16) */
#define W7 0.27589937928294311 /* sqrt(2)*cos(7*pi/16) */

/*
 * Convert weight v to fixed point with scale b, rounding to nearest by
 * adding .5 before the truncating cast.  Only used on non-negative
 * products here; the tables negate the result afterwards.
 */
#define WIFY(b,v) ((int)((b)*(v)+.5))

/*
 * Row-pass fixed-point scale and weights.  The results are stored in
 * int16_t tables, so every Wr<k> has to fit in 16 signed bits.
 * Wr must remain a plain integer literal: it is compared in an #if below.
 */
#define Wr 16384
#define Wr0 WIFY(Wr,W0)
#define Wr1 WIFY(Wr,W1)
#define Wr2 WIFY(Wr,W2)
#define Wr3 WIFY(Wr,W3)
#define Wr4 WIFY(Wr,W4)
#define Wr5 WIFY(Wr,W5)
#define Wr6 WIFY(Wr,W6)
#define Wr7 WIFY(Wr,W7)

/* Row-pass results are shifted right by Wrshift; Wrround is half of 1<<Wrshift. */
#define Wrshift 11
#define Wrround (1<<((Wrshift)-1))

/*
 * Column-pass fixed-point scale and weights (same constraints as Wr).
 * Wc must remain a plain integer literal: it is compared in an #if below.
 */
#define Wc 16384
#define Wc0 WIFY(Wc,W0)
#define Wc1 WIFY(Wc,W1)
#define Wc2 WIFY(Wc,W2)
#define Wc3 WIFY(Wc,W3)
#define Wc4 WIFY(Wc,W4)
#define Wc5 WIFY(Wc,W5)
#define Wc6 WIFY(Wc,W6)
#define Wc7 WIFY(Wc,W7)

/* Column-pass results are shifted right by Wcshift; Wcround is half of 1<<Wcshift. */
#define Wcshift 20
#define Wcround (1<<((Wcshift)-1))

// NOTE: the original author states Wr*Wc == (1<<(Wrshift+Wcshift)).
// NOTE(review): with the values above Wr*Wc is 1<<28 while Wrshift+Wcshift is 31;
// the remaining factor of 8 is presumably the 2-D IDCT normalisation that the
// sqrt(2)-scaled W constants leave out -- confirm before changing any of these.
// also, Wr <= 8<<Wrshift (seems to be a good relationship to keep the intermediate matrix in bounds)

/*
 * Row-pass weights for the first butterfly stage, laid out for MMXMULTADD.
 *
 * Each group of eight entries serves one register pair: entries 0..3 are
 * multiplied into the first register, entries 4..7 into the second.  Every
 * (weight, weight) couple appears twice because each MMX register carries
 * two 32-bit lanes, each made of two 16-bit inputs.
 */
static int16_t idct_mmx_row_table[32] = {
    /* mm0 / mm1 (x0, x1) */
     Wr0, -Wr4,  Wr0, -Wr4,     Wr4,  Wr0,  Wr4,  Wr0,
    /* mm2 / mm3 (x2, x3) */
    -Wr2,  Wr6, -Wr2,  Wr6,     Wr6,  Wr2,  Wr6,  Wr2,
    /* mm4 / mm5 (x4, x5) */
     Wr5,  Wr3,  Wr5,  Wr3,     Wr3, -Wr5,  Wr3, -Wr5,
    /* mm6 / mm7 (x6, x7) */
     Wr1,  Wr7,  Wr1,  Wr7,     Wr7, -Wr1,  Wr7, -Wr1
};

/*
 * Column-pass weights, same layout as idct_mmx_row_table.
 *
 * When both passes use the same fixed-point scale the row table is simply
 * reused.  Otherwise a dedicated table is built from the Wc constants; it
 * must be named idct_mmx_col_table (not idct_mmx_row_table), because that
 * is the identifier the column pass of idct_mmx() reads and the row table
 * is already defined above.
 */
#if Wr==Wc
#define idct_mmx_col_table idct_mmx_row_table
#else
static int16_t idct_mmx_col_table[32]={
    Wc0,  -Wc4,  Wc0, -Wc4,
    Wc4,   Wc0,  Wc4,  Wc0,
    -Wc2,  Wc6, -Wc2,  Wc6,
    Wc6,   Wc2,  Wc6,  Wc2,
    Wc5,   Wc3,  Wc5,  Wc3,
    Wc3,  -Wc5,  Wc3, -Wc5,
    Wc1,   Wc7,  Wc1,  Wc7,
    Wc7,  -Wc1,  Wc7, -Wc1
};
#endif

/*
 * Rounding bias for the row pass, one copy per 32-bit MMX lane.  It is
 * added to x0 and x1 right after the first stage so that the final
 * right shift by Wrshift rounds instead of truncating.
 */
static int32_t idct_mmx_row_round[2] = { Wrround, Wrround };

/* Same as above for the column pass (pairs with the shift by Wcshift). */
static int32_t idct_mmx_col_round[2] = { Wcround, Wcround };

/* Expands to x four times, comma separated (e.g. one copy per lane in an initializer). */
#define RPT4(x) x, x, x, x
/*
 * Expands to the four 4-wide constants consumed by SSEMULTADD at offsets
 * 0, 4, 8 and 12:  I = A/B,  K = (D*A)/C - B,  B,  L = C/A.
 * B, C and A appear as divisors and must be non-zero.
 */
#define SSEMAT(A,B,C,D) RPT4((A)/(B)), RPT4(((D)*(A))/(C)-(B)), RPT4((B)), RPT4((C)/(A))
/*
 * Round x up to the next multiple of a (a must be a power of two) and
 * yield it as void *.  The mask is computed in size_t so that a narrower
 * or unsigned alignment argument cannot clear the upper address bits.
 */
#define ALIGN_PTR(x,a) ((void *)((((size_t)(x))+((size_t)(a)-1))&~((size_t)(a)-1)))
/*
 * Pack four selectors into one value, two bits each, A in the lowest bits.
 * NOTE(review): presumably meant as a shuffle-instruction immediate with
 * each selector in 0..3 -- no user is visible in this file; confirm.
 */
#define SHUFFLEMAP(A,B,C,D) ((A)*1+(B)*4+(C)*16+(D)*64)

// computes A, B = A*x[0] + B*x[1], A*x[4] + B*x[5]
// A and B must be modifiable int lvalues; both are evaluated more than once.
// Wrapped in do { } while (0) so the macro behaves as one statement (safe in
// if/else), and the temporary has a reserved-looking name so that it cannot
// capture a caller variable that happens to be called t.
#define MULTADD(A,B,x) \
    do { \
        int multadd_new_a_ = (A)*(x)[0] + (B)*(x)[1]; \
        (B) = (A)*(x)[4] + (B)*(x)[5]; \
        (A) = multadd_new_a_; \
    } while (0)
/*
  on register starved platforms, it can be executed as:

  x, y = A*x + B*y, C*x + D*y;

  x*=I;  I*x, y
  x+=y;  I*x + y, y
  y*=K;  I*x + y, K*y
  x*=B;  B*I*x + B*y, K*y
  y+=x;  B*I*x + B*y, A*x + (B+K)*y
  y*=L;  B*I*x + B*y, L*A*x + L*(B+K)*y
  
  I = A/B
  K = (D*A)/C-B
  B
  L = C/A
*/
/*
 * MMX form of MULTADD: copies register x into register y, then multiplies
 * x by the four 16-bit weights at t[0..3] and y by those at t[4..7] using
 * pmaddwd.  x and y are MMX register names, t is an int16_t table; the
 * previous contents of y are discarded.
 * Wrapped in do { } while (0) so that all three operations stay together
 * when the macro is used as the body of an if/else or a loop.
 */
#define MMXMULTADD(x,y,t)           \
    do {                            \
        movq_r2r(x, y);             \
        pmaddwd_m2r((t)[0], x);     \
        pmaddwd_m2r((t)[4], y);     \
    } while (0)

/*
 * SSE form of the 2x2 multiply, using the multiply/add sequence from the
 * register-starved recipe: t[0], t[4], t[8] and t[12] are expected to hold
 * the I, K, B and L constants (the layout SSEMAT produces).  x and y are
 * SSE register names and are both overwritten with the results.
 * Wrapped in do { } while (0) so the whole sequence acts as one statement.
 */
#define SSEMULTADD(x,y,t)           \
    do {                            \
        mulps_m2r((t)[0], x);       \
        addps_r2r(y, x);            \
        mulps_m2r((t)[4], y);       \
        mulps_m2r((t)[8], x);       \
        addps_r2r(x, y);            \
        mulps_m2r((t)[12], y);      \
    } while (0)


// computes A, B = A-B, A+B;
// A and B must be modifiable int lvalues.  Wrapped in do { } while (0) so
// the macro behaves as one statement (safe in if/else), and the temporary
// has a reserved-looking name so that it cannot capture a caller variable
// that happens to be called t.
#define ADDDIFF(A,B) \
    do { \
        int adddiff_old_a_ = (A); \
        (A) -= (B); \
        (B) += adddiff_old_a_; \
    } while (0)
/*
  on register starved platforms, it can be executed as:

  A-=B;  A-B, B
  B+=B;  A-B, 2B
  B+=A;  A-B, A+B
*/
/*
 * MMX form of ADDDIFF on 32-bit lanes, without a scratch register:
 * x = x - y, then y = 2*y + (x - y) = x + y.
 * Wrapped in do { } while (0) so that all three operations stay together
 * when the macro is used as the body of an if/else or a loop.
 */
#define MMXADDDIFF(x,y)         \
    do {                        \
        psubd_r2r(y, x);        \
        paddd_r2r(y, y);        \
        paddd_r2r(x, y);        \
    } while (0)

/*
 * SSE form of ADDDIFF that uses C as a scratch register:
 * C = A, A = A - B, B = B + C.  C is left holding the old value of A.
 * Wrapped in do { } while (0) so the whole sequence acts as one statement.
 */
#define SSEADDDIFF_t(A,B,C)     \
    do {                        \
        movaps_r2r(A,C);        \
        subps_r2r(B,A);         \
        addps_r2r(C,B);         \
    } while (0)

/*
 * SSE form of ADDDIFF without a scratch register:
 * x = x - y, then y = 2*y + (x - y) = x + y.
 * Wrapped in do { } while (0) so the whole sequence acts as one statement.
 */
#define SSEADDDIFF(x,y)         \
    do {                        \
        subps_r2r(y, x);        \
        addps_r2r(y, y);        \
        addps_r2r(x, y);        \
    } while (0)

// the MMX and SSE versions effectively implement this for idctrow:

/*
        x0 = src[0];
        x1 = src[4];
        x2 = src[6];
        x3 = src[2];
        x4 = src[5];
        x5 = src[3];
        x6 = src[1];
        x7 = src[7];

        // first stage
        
        MULTADD(x0, x1, idct_mmx_row_table);
        x0+=Wrround;
        x1+=Wrround;
        
        MULTADD(x2, x3, idct_mmx_row_table+8);

        MULTADD(x4, x5, idct_mmx_row_table+16);

        MULTADD(x6, x7, idct_mmx_row_table+24);
  
        // second stage
        ADDDIFF(x6, x4);

        ADDDIFF(x7, x5);
  
        // third stage
        ADDDIFF(x1, x3);

        ADDDIFF(x0, x2);

        ADDDIFF(x6, x7);

        // fourth stage
        ADDDIFF(x3, x4);

        ADDDIFF(x1, x5);

        dst[0] = (x4)>>Wrshift;
        dst[3] = (x5)>>Wrshift;
        dst[4] = (x1)>>Wrshift;
        dst[7] = (x3)>>Wrshift;

        x7 = MUL_BY_ROOT_2_OVER_2(x7);
        x6 = MUL_BY_ROOT_2_OVER_2(x6);
  
        ADDDIFF(x2, x7);
        ADDDIFF(x0, x6);

        dst[1] = (x7)>>Wrshift;
        dst[2] = (x6)>>Wrshift;
        dst[5] = (x0)>>Wrshift;
        dst[6] = (x2)>>Wrshift;
*/

/*
 * In-place 8x8 inverse DCT built from the MMX wrapper macros of mmx.h.
 *
 * block: 64 int16_t coefficients; they are read in full and overwritten
 *        with the result.
 *
 * Two structurally identical passes run back to back.  Each loop iteration
 * loads two consecutive groups of eight values (src[0..7] and src[8..15]),
 * keeps the first group in the low and the second in the high 32-bit lane
 * of every MMX register, runs the butterfly network shown in the
 * pseudo-code comment above this function, and stores output k of both
 * groups as the adjacent pair dst[k*8], dst[k*8+1].  Since dst advances by
 * 2 per iteration, each pass writes its result transposed: the first pass
 * goes block -> temp with the row constants, the second temp -> block with
 * the column constants, which restores the original orientation.
 *
 * After the first stage the registers map onto the pseudo-code variables
 * as mm0=x0, mm1=x1, ... mm7=x7.
 */
void idct_mmx(int16_t *block)
{
    int16_t temp[64], *src, *dst;
    int i;

    /* pass 1: block -> temp, using idct_mmx_row_table / Wrround / Wrshift */
    src=block;
    dst=temp;
    for( i=0; i<4; src+=16, dst+=2, i++ ) {

        /*
         * STOREMM: shift both 32-bit lanes of r right by s, pack them down
         * to 16 bits and store the low 32 bits of r at d.  r is clobbered.
         * NOTE(review): assuming mmx.h maps these to the like-named
         * instructions, the shift is arithmetic, the pack saturates, and
         * the store fills d plus the int16_t right after it -- confirm.
         * The macro is never #undef'd and is reused by the second pass.
         */
#define STOREMM(r, d, s)    \
        psrad_i2r(s, r);    \
        packssdw_r2r(r, r); \
        movd_r2m(r, d);


        /* first stage */
        // weights as laid out in idct_mmx_row_table:
        // x0, x1 =  W0*x0 - W4*x1 + Wrround, W4*x0 + W0*x1 + Wrround;
        // x2, x3 = -W2*x2 + W6*x3, W6*x2 + W2*x3;
        // x4, x5 =  W5*x4 + W3*x5, W3*x4 - W5*x5;
        // x6, x7 =  W1*x6 + W7*x7, W7*x6 - W1*x7;


        // load coefficients 0..3 of both groups and interleave them so that
        // mm0 = (c0,c1) and mm2 = (c2,c3), first group in the low lane
        movq_m2r(src[0], mm0);
        movq_m2r(src[8], mm1);
        movq_r2r(mm0, mm2);
        punpckldq_r2r(mm1, mm0);
        punpckhdq_r2r(mm1, mm2);

        // same for coefficients 4..7: mm4 = (c4,c5), mm6 = (c6,c7)
        movq_m2r(src[4], mm4);
        movq_m2r(src[12], mm5);
        movq_r2r(mm4, mm6);
        punpckldq_r2r(mm5, mm4);
        punpckhdq_r2r(mm5, mm6);

        // build the two word masks:
        // mm5 = low word of each lane set  (0x0000ffff per lane)
        // mm7 = high word of each lane set (0xffff0000 per lane)
        pxor_r2r(mm5, mm5);
        pcmpeqw_r2r(mm5, mm5);
        movq_r2r(mm5, mm7);
        psrld_i2r(16, mm5);
        pxor_r2r(mm5, mm7);

        // the rest of the shuffle regroups the (low word, high word) pairs
        // into the operand order the butterflies need
        movq_r2r(mm4, mm1);

        // mm0=0,1 / mm2=2,3 / mm4=4,5 / mm6=6,7 / scratch mm1=4,5
        movq_r2r(mm0, mm3);
        pand_r2r(mm5, mm0);
        pslld_i2r(16, mm1);
        por_r2r(mm1, mm0);

        // mm0=0,4 / mm2=2,3 / mm4=4,5 / mm6=6,7 / scratch mm3=0,1
        movq_r2r(mm6, mm1);
        pand_r2r(mm7, mm6);
        psrld_i2r(16, mm3);
        por_r2r(mm3, mm6);

        // mm0=0,4 / mm2=2,3 / mm4=4,5 / mm6=1,7 / scratch mm1=6,7
        movq_r2r(mm2, mm3);
        pslld_i2r(16, mm2);
        pand_r2r(mm5, mm1);
        por_r2r(mm1, mm2);

        // mm0=0,4 / mm2=6,2 / mm4=4,5 / mm6=1,7 / scratch mm3=2,3
        psrld_i2r(16, mm4);
        pand_r2r(mm7, mm3);
        por_r2r(mm3, mm4);

        // mm0=0,4 / mm2=6,2 / mm4=5,3 / mm6=1,7

        // each MMXMULTADD overwrites its second register, so the masks in
        // mm5/mm7 and the scratch copies in mm1/mm3 are gone from here on
        MMXMULTADD(mm0, mm1, idct_mmx_row_table);
        MMXMULTADD(mm2, mm3, idct_mmx_row_table+8);
        MMXMULTADD(mm4, mm5, idct_mmx_row_table+16);
        MMXMULTADD(mm6, mm7, idct_mmx_row_table+24);

        // add the rounding bias once; it propagates into every output
        paddd_m2r(idct_mmx_row_round[0], mm0);
        paddd_m2r(idct_mmx_row_round[0], mm1);

        /* second stage */

        MMXADDDIFF(mm6, mm4);    
        MMXADDDIFF(mm7, mm5);
        
        /* third stage */
        
        MMXADDDIFF( mm1, mm3 );
        MMXADDDIFF( mm0, mm2 );
        MMXADDDIFF( mm6, mm7 );
    
        /* fourth stage */

        MMXADDDIFF( mm3, mm4 );
        MMXADDDIFF( mm1, mm5 );

        /* fifth stage: outputs 0, 3, 4 and 7 are final */

        STOREMM(mm4, dst[0*8], Wrshift);
        STOREMM(mm5, dst[3*8], Wrshift);
        STOREMM(mm1, dst[4*8], Wrshift);
        STOREMM(mm3, dst[7*8], Wrshift);

        /* sixth stage */
        // scale x6 and x7 by sqrt(2)/2; the reference form is
        // x6 = (181*x6+128)>>8;
        // x7 = (181*x7+128)>>8;
      
        // actually, this computes, roughly: x6 -= (x6>>8)*75
        // as x - (x>>2) - (x>>5) - (x>>7) - (x>>8); the shifts below are
        // cumulative and the results end up in mm4 (x6) and mm5 (x7),
        // which are free again after the stores above
        movq_r2r(mm6, mm4);
        movq_r2r(mm7, mm5);

        psrad_i2r(2, mm6);
        psrad_i2r(2, mm7);
        psubd_r2r(mm6, mm4);
        psubd_r2r(mm7, mm5);

        psrad_i2r(3, mm6);
        psrad_i2r(3, mm7);
        psubd_r2r(mm6, mm4);
        psubd_r2r(mm7, mm5);

        psrad_i2r(2, mm6);
        psrad_i2r(2, mm7);
        psubd_r2r(mm6, mm4);
        psubd_r2r(mm7, mm5);

        psrad_i2r(1, mm6);
        psrad_i2r(1, mm7);
        psubd_r2r(mm6, mm4);
        psubd_r2r(mm7, mm5);
    
        /* seventh stage */

        MMXADDDIFF( mm2, mm5 );
        MMXADDDIFF( mm0, mm4 );
        
        /* eighth stage: remaining outputs 1, 2, 5 and 6 */

        STOREMM(mm5, dst[1*8], Wrshift);
        STOREMM(mm4, dst[2*8], Wrshift);
        STOREMM(mm0, dst[5*8], Wrshift);
        STOREMM(mm2, dst[6*8], Wrshift);
    }

    /*
     * pass 2: temp -> block, using idct_mmx_col_table / Wcround / Wcshift.
     * The instruction sequence is the same as in pass 1.
     */
    src=temp;
    dst=block;
    for( i=0; i<4; src+=16, dst+=2, i++ ) {

        /* first stage */
        // weights as laid out in idct_mmx_col_table:
        // x0, x1 =  W0*x0 - W4*x1 + Wcround, W4*x0 + W0*x1 + Wcround;
        // x2, x3 = -W2*x2 + W6*x3, W6*x2 + W2*x3;
        // x4, x5 =  W5*x4 + W3*x5, W3*x4 - W5*x5;
        // x6, x7 =  W1*x6 + W7*x7, W7*x6 - W1*x7;
    
        // mm0 = (c0,c1), mm2 = (c2,c3), first group in the low lane
        movq_m2r(src[0], mm0);
        movq_m2r(src[8], mm1);
        movq_r2r(mm0, mm2);
        punpckldq_r2r(mm1, mm0);
        punpckhdq_r2r(mm1, mm2);

        // mm4 = (c4,c5), mm6 = (c6,c7)
        movq_m2r(src[4], mm4);
        movq_m2r(src[12], mm5);
        movq_r2r(mm4, mm6);
        punpckldq_r2r(mm5, mm4);
        punpckhdq_r2r(mm5, mm6);

        // mm5 = low word of each lane set
        // mm7 = high word of each lane set
        pxor_r2r(mm5, mm5);
        pcmpeqw_r2r(mm5, mm5);
        movq_r2r(mm5, mm7);
        psrld_i2r(16, mm5);
        pxor_r2r(mm5, mm7);

        movq_r2r(mm4, mm1);

        // mm0=0,1 / mm2=2,3 / mm4=4,5 / mm6=6,7 / scratch mm1=4,5
        movq_r2r(mm0, mm3);
        pand_r2r(mm5, mm0);
        pslld_i2r(16, mm1);
        por_r2r(mm1, mm0);

        // mm0=0,4 / mm2=2,3 / mm4=4,5 / mm6=6,7 / scratch mm3=0,1
        movq_r2r(mm6, mm1);
        pand_r2r(mm7, mm6);
        psrld_i2r(16, mm3);
        por_r2r(mm3, mm6);

        // mm0=0,4 / mm2=2,3 / mm4=4,5 / mm6=1,7 / scratch mm1=6,7
        movq_r2r(mm2, mm3);
        pslld_i2r(16, mm2);
        pand_r2r(mm5, mm1);
        por_r2r(mm1, mm2);

        // mm0=0,4 / mm2=6,2 / mm4=4,5 / mm6=1,7 / scratch mm3=2,3
        psrld_i2r(16, mm4);
        pand_r2r(mm7, mm3);
        por_r2r(mm3, mm4);

        // mm0=0,4 / mm2=6,2 / mm4=5,3 / mm6=1,7

        MMXMULTADD(mm0, mm1, idct_mmx_col_table);
        MMXMULTADD(mm2, mm3, idct_mmx_col_table+8);
        MMXMULTADD(mm4, mm5, idct_mmx_col_table+16);
        MMXMULTADD(mm6, mm7, idct_mmx_col_table+24);

        // add the rounding bias once; it propagates into every output
        paddd_m2r(idct_mmx_col_round[0], mm0);
        paddd_m2r(idct_mmx_col_round[0], mm1);

        /* second stage */

        MMXADDDIFF(mm6, mm4);    
        MMXADDDIFF(mm7, mm5);
        
        /* third stage */
        
        MMXADDDIFF( mm1, mm3 );
        MMXADDDIFF( mm0, mm2 );
        MMXADDDIFF( mm6, mm7 );
    
        /* fourth stage */

        MMXADDDIFF( mm3, mm4 );
        MMXADDDIFF( mm1, mm5 );

        /* fifth stage: outputs 0, 3, 4 and 7 are final */

        STOREMM(mm4, dst[0*8], Wcshift);
        STOREMM(mm5, dst[3*8], Wcshift);
        STOREMM(mm1, dst[4*8], Wcshift);
        STOREMM(mm3, dst[7*8], Wcshift);

        /* sixth stage */
        // scale x6 and x7 by sqrt(2)/2; the reference form is
        // x6 = (181*x6+128)>>8;
        // x7 = (181*x7+128)>>8;
      
        // actually, this computes, roughly: x6 -= (x6>>8)*75
        // as x - (x>>2) - (x>>5) - (x>>7) - (x>>8), results in mm4 / mm5
        movq_r2r(mm6, mm4);
        movq_r2r(mm7, mm5);

        psrad_i2r(2, mm6);
        psrad_i2r(2, mm7);
        psubd_r2r(mm6, mm4);
        psubd_r2r(mm7, mm5);

        psrad_i2r(3, mm6);
        psrad_i2r(3, mm7);
        psubd_r2r(mm6, mm4);
        psubd_r2r(mm7, mm5);

        psrad_i2r(2, mm6);
        psrad_i2r(2, mm7);
        psubd_r2r(mm6, mm4);
        psubd_r2r(mm7, mm5);

        psrad_i2r(1, mm6);
        psrad_i2r(1, mm7);
        psubd_r2r(mm6, mm4);
        psubd_r2r(mm7, mm5);
    
        /* seventh stage */

        MMXADDDIFF( mm2, mm5 );
        MMXADDDIFF( mm0, mm4 );

        /* eighth stage: remaining outputs 1, 2, 5 and 6 */

        STOREMM(mm5, dst[1*8], Wcshift);
        STOREMM(mm4, dst[2*8], Wcshift);
        STOREMM(mm0, dst[5*8], Wcshift);
        STOREMM(mm2, dst[6*8], Wcshift);
    }

    /* NOTE(review): presumably resets the MMX state so later x87
       floating-point code works -- see mmx.h */
    emms();
}

/*
 * Inverse-transform the 64 coefficients at b in place.
 * Thin wrapper that forwards to idct_mmx().
 */
void IDCT_reference(short *b)
{
    idct_mmx(b);
}

/*
 * Out-of-place variant: copy the 64 coefficients from in to out, then run
 * idct_mmx() on out.  in is only read.
 */
void IDCT_test(short *in,short* out)
{
    short *from = in;
    short *to = out;
    short *const stop = out + 64;

    /* element-by-element copy in ascending order */
    while (to != stop) {
        *to++ = *from++;
    }

    idct_mmx(out);
}

/*
 * Initialisation hook.  It performs no work in this implementation: every
 * table used by idct_mmx() is a statically initialised array.
 */
void IDCT_init(void)
{
    /* intentionally empty */
}
