/*
 * Copyright (c) 2007 - 2015 Joseph Gaeddert
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

// 
// Floating-point dot product (altivec velocity engine)
//

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "liquid.internal.h"

#define DEBUG_DOTPROD_RRRF_AV   0

//
// basic (non-structured) dot product
//

// basic dot product: accumulate the element-wise products of
// the coefficients and the input, in index order
//  _h      :   coefficients array [size: 1 x _n]
//  _x      :   input array [size: 1 x _n]
//  _n      :   input lengths
//  _y      :   output dot product
void dotprod_rrrf_run(float *      _h,
                      float *      _x,
                      unsigned int _n,
                      float *      _y)
{
    float acc = 0.0f;
    unsigned int k = 0;

    // walk both arrays from the first element to the last
    while (k < _n) {
        acc += _h[k] * _x[k];
        k++;
    }

    // store result (zero when _n is zero)
    *_y = acc;
}

// basic dot product with the main loop manually unrolled by four;
// products are accumulated in index order
//  _h      :   coefficients array [size: 1 x _n]
//  _x      :   input array [size: 1 x _n]
//  _n      :   input lengths
//  _y      :   output dot product
void dotprod_rrrf_run4(float *      _h,
                       float *      _x,
                       unsigned int _n,
                       float *      _y)
{
    float acc = 0.0f;

    // split the length into complete groups of 4 and a remainder
    const unsigned int num_groups = _n / 4;
    const unsigned int num_tail   = _n % 4;

    // running positions within the two arrays
    const float * hp = _h;
    const float * xp = _x;

    // process the complete groups of 4
    unsigned int g;
    for (g = 0; g < num_groups; g++) {
        acc += hp[0] * xp[0];
        acc += hp[1] * xp[1];
        acc += hp[2] * xp[2];
        acc += hp[3] * xp[3];
        hp += 4;
        xp += 4;
    }

    // process the remaining (at most 3) elements
    unsigned int k;
    for (k = 0; k < num_tail; k++)
        acc += hp[k] * xp[k];

    *_y = acc;
}


//
// structured dot product
//

// dotprod object: the coefficient count together with four padded
// copies of the coefficients, one for each possible alignment of
// the input data within a 128-bit vector
struct dotprod_rrrf_s {
    // dotprod length (number of coefficients)
    unsigned int n;

    // coefficients arrays: the altivec velocity engine operates
    // on 128-bit registers which can hold four 32-bit floating-
    // point values.  We need to hold 4 copies of the coefficients
    // to meet all possible alignments to the input data.
    // dotprod_rrrf_create() fills h[k] with k leading zeros
    // followed by the n coefficients; each buffer is allocated
    // (zero-initialized) in whole multiples of sizeof(vector float).
    float *h[4];
};

// create the structured dotprod object
//  _h      :   coefficients array [size: 1 x _n]
//  _n      :   dotprod length (number of coefficients)
// returns the new object, or NULL if memory could not be allocated;
// the object must be released with dotprod_rrrf_destroy()
dotprod_rrrf dotprod_rrrf_create(float *      _h,
                                 unsigned int _n)
{
    dotprod_rrrf dp = (dotprod_rrrf)malloc(sizeof(struct dotprod_rrrf_s));
    if (dp == NULL)
        return NULL;
    dp->n = _n;

    // create 4 copies of the input coefficients (one for each
    // data alignment).  For example: _h = {1,2,3,4,5,6} (_n = 6)
    //  dp->h[0] = {1,2,3,4,5,6}
    //  dp->h[1] = {. 1,2,3,4,5,6}
    //  dp->h[2] = {. . 1,2,3,4,5,6}
    //  dp->h[3] = {. . . 1,2,3,4,5,6}
    // where '.' is a zero-valued padding element (calloc clears
    // the buffers, including any trailing padding).
    //
    // NOTE(review): dotprod_rrrf_execute() reads these buffers
    // through a 'vector float *'; this presumes calloc returns
    // memory aligned to 16 bytes -- confirm for the target platform.
    unsigned int i,j;
    for (i=0; i<4; i++) {
        // number of 4-float vectors needed to hold i padding values
        // followed by n coefficients: ceil((n+i)/4).  Written as
        // (n+i+3)/4 so the unsigned arithmetic cannot wrap below
        // zero when n and i are both zero.
        unsigned int num_vectors = (dp->n + i + 3)/4;

        // always request at least one vector so that a NULL result
        // unambiguously signals an allocation failure
        if (num_vectors == 0)
            num_vectors = 1;

        dp->h[i] = calloc(num_vectors,sizeof(vector float));
        if (dp->h[i] == NULL) {
            // release everything allocated so far
            while (i > 0)
                free(dp->h[--i]);
            free(dp);
            return NULL;
        }

        for (j=0; j<dp->n; j++)
            dp->h[i][j+i] = _h[j];
    }

    return dp;
}

// re-create the structured dotprod object: the existing object is
// destroyed and a brand new one is built from the given coefficients
//  _q      :   existing dotprod object (destroyed by this call)
//  _h      :   new coefficients array [size: 1 x _n]
//  _n      :   new dotprod length
dotprod_rrrf dotprod_rrrf_recreate(dotprod_rrrf _q,
                                   float *      _h,
                                   unsigned int _n)
{
    // release the old object in its entirety...
    dotprod_rrrf_destroy(_q);

    // ...then build its replacement from scratch
    dotprod_rrrf q_new = dotprod_rrrf_create(_h,_n);
    return q_new;
}

// destroy the structured dotprod object, releasing each of the
// four coefficient buffers and then the object itself
void dotprod_rrrf_destroy(dotprod_rrrf _q)
{
    // release the coefficient copies (one per data alignment)
    free(_q->h[0]);
    free(_q->h[1]);
    free(_q->h[2]);
    free(_q->h[3]);

    // release the main object
    free(_q);
}

// print the dotprod object to stdout: a header line followed by
// one line per coefficient (taken from the un-padded copy, h[0])
void dotprod_rrrf_print(dotprod_rrrf _q)
{
    const unsigned int num_coeffs = _q->n;
    const float *      coeffs     = _q->h[0];

    printf("dotprod_rrrf [altivec, %u coefficients]:\n", num_coeffs);

    unsigned int k = 0;
    while (k < num_coeffs) {
        printf("  %3u : %12.9f\n", k, coeffs[k]);
        k++;
    }
}

// execute vectorized structured inner dot product
//  _q      :   dotprod object
//  _x      :   input array [size: 1 x _q->n]
//  _r      :   output dot product
//
// NOTE(review): the input is read in whole 16-byte blocks starting
// at the aligned address at or below _x, so the first and last
// blocks may include up to three floats lying outside the input
// array.  Those lanes are multiplied by zero-valued padding
// coefficients, which presumably cancels them, but a NaN or Inf in
// one of those lanes would propagate into the result -- confirm
// whether callers can rule this out.
void dotprod_rrrf_execute(dotprod_rrrf _q,
                          float *      _x,
                          float *      _r)
{
    unsigned int al; // input data alignment (in floats, 0..3)

    vector float *ar,*d;
    vector float s0,s1,s2,s3;
    union { vector float v; float w[4];} s;
    unsigned int nblocks;

    // split the input address into its 16-byte-aligned base and
    // the offset (in floats) from that base; uintptr_t is used so
    // the address is not truncated where pointers are wider than int
    // (assumes _x is itself aligned to sizeof(float) -- TODO confirm)
    uintptr_t addr = (uintptr_t)_x;
    ar = (vector float*)(addr & ~(uintptr_t)15);
    al = (unsigned int)((addr & 15)/sizeof(float));

    // select the coefficients copy padded to match the alignment
    d = (vector float*)_q->h[al];

    // number of 4-float blocks spanning the 'al' leading padding
    // values plus the n inputs: ceil((n+al)/4).  Written as
    // (n+al+3)/4 so the unsigned arithmetic cannot wrap below zero
    // when n and al are both zero.
    nblocks = (_q->n + al + 3)/4;

    // split into four vectors each with four 32-bit
    // partial sums.  Effectively each loop iteration
    // operates on 16 input samples at a time.
    s0 = s1 = s2 = s3 = (vector float)(0);
    while (nblocks >= 4) {
        s0 = vec_madd(ar[nblocks-1],d[nblocks-1],s0);
        s1 = vec_madd(ar[nblocks-2],d[nblocks-2],s1);
        s2 = vec_madd(ar[nblocks-3],d[nblocks-3],s2);
        s3 = vec_madd(ar[nblocks-4],d[nblocks-4],s3);
        nblocks -= 4;
    }

    // fold the resulting partial sums into vector s0
    s0 = vec_add(s0,s1);    // s0 = s0+s1
    s2 = vec_add(s2,s3);    // s2 = s2+s3
    s0 = vec_add(s0,s2);    // s0 = s0+s2

    // finish partial summing operations (fewer than
    // four blocks remain at this point)
    while (nblocks-- > 0)
        s0 = vec_madd(ar[nblocks],d[nblocks],s0);

    // move the result into the union s (effectively,
    // this loads the four 32-bit values in s0 into
    // the array w).
    s.v = vec_add(s0,(vector float)(0));

    // sum the resulting array
    *_r = s.w[0] + s.w[1] + s.w[2] + s.w[3];
}

