#include <stdio.h>
#include <math.h>
#include <xcore/hwtimer.h>
#include "downsample.h"
#include "upsample.h"

// Input signal for the downsampling benchmark: 768 samples whose values are
// pulled in from inputs_ds.h. The line below is the original author's note on
// how the table was generated; it is kept for reference and is not compiled.
//        input_ds[i] = 10000 * (sin(i / 768.0 * 2 * 3.1415) + sin(i/2.1 * 2 * 3.1415));
int input_ds[768] = {
#include "inputs_ds.h"
};

/*
 * Benchmark the three 2x downsampling implementations against each other.
 *
 * The samples in input_ds are fed, two at a time, through ds_simple(),
 * ds_fast() and ds_vpu(). Each pass is bracketed by reference-timer readings.
 * The function prints the elapsed ticks and the resulting throughput of each
 * pass, followed by a table that lists every input next to the three outputs
 * so the implementations can be compared by eye.
 */
void main_ds(void) {
    enum {
        BENCH_DS_INPUTS = 768,
        BENCH_DS_OUTPUTS = BENCH_DS_INPUTS / 2,
        // Timer ticks per millisecond used for the throughput figure; this
        // corresponds to a 100 MHz reference clock.
        BENCH_DS_TICKS_PER_MS = 100000
    };
    int output[BENCH_DS_OUTPUTS];
    int output_fast[BENCH_DS_OUTPUTS];
    int output_vpu[BENCH_DS_OUTPUTS];
    // Timestamps are kept unsigned so that the differences computed below
    // stay well defined even if the timer counter wraps between two readings
    // (subtracting signed values could overflow, which is undefined).
    unsigned t0, t1, t2, t3;

    // Initialisation happens before the first timestamp, so it is excluded
    // from all measurements.
    ds_fast_init();

    // First time the simple C version
    t0 = get_reference_time();
    for (int i = 0; i < BENCH_DS_OUTPUTS; i++) {
        // Take two inputs to produce an output, 2x downsampling
        output[i] = ds_simple(input_ds[2*i], input_ds[2*i+1]);
    }

    // Now time the lib_xcore_math version
    t1 = get_reference_time();
    for (int i = 0; i < BENCH_DS_OUTPUTS; i++) {
        // Take two inputs to produce an output, 2x downsampling
        output_fast[i] = ds_fast(input_ds[2*i], input_ds[2*i+1]);
    }

    // Finally time the hand-rolled VPU version.
    t2 = get_reference_time();
    for (int i = 0; i < BENCH_DS_OUTPUTS; i++) {
        // Take two inputs to produce an output, 2x downsampling
        output_vpu[i] = ds_vpu(input_ds[2*i], input_ds[2*i+1]);
    }
    t3 = get_reference_time();

    // Elapsed ticks per pass; the modular unsigned difference is taken first
    // and only then narrowed to int for printing and division.
    const int ticks_simple = (int)(t1 - t0);
    const int ticks_fast = (int)(t2 - t1);
    const int ticks_vpu = (int)(t3 - t2);

    printf("Simple: %6d ticks for 384 outputs, %d output samples/ms\n",
           ticks_simple, BENCH_DS_OUTPUTS * BENCH_DS_TICKS_PER_MS / ticks_simple);
    printf("Fast:   %6d ticks for 384 outputs, %d output samples/ms\n",
           ticks_fast, BENCH_DS_OUTPUTS * BENCH_DS_TICKS_PER_MS / ticks_fast);
    printf("VPU:    %6d ticks for 384 outputs, %d output samples/ms\n",
           ticks_vpu, BENCH_DS_OUTPUTS * BENCH_DS_TICKS_PER_MS / ticks_vpu);

    // Two rows per output: the first shows the even input on its own, the
    // second shows the odd input followed by the VPU, fast and simple results.
    for (int i = 0; i < BENCH_DS_OUTPUTS; i++) {
        printf("%3d %6d\n", i, input_ds[2*i]);
        printf("%3d %6d %6d %6d %6d\n", i, input_ds[2*i+1], output_vpu[i], output_fast[i], output[i]);
    }
}

// Input signal for the upsampling benchmark: 384 samples whose values are
// pulled in from inputs_us.h. The line below is the original author's note on
// how the table was generated; it is kept for reference and is not compiled.
//        input_us[i] = 10000 * (sin(i / 768.0 * 2 * 3.1415));
int input_us[384] = {
#include "inputs_us.h"
};

/*
 * Benchmark the three 2x upsampling implementations against each other.
 *
 * Every sample of input_us is passed to us_simple(), us_fast() and us_vpu()
 * together with a pointer to the matching pair of slots in the respective
 * output array. Each pass is bracketed by reference-timer readings. The
 * function prints the elapsed ticks and the resulting throughput of each
 * pass, followed by a table that lists every input next to the outputs of
 * the three implementations so they can be compared by eye.
 */
void main_us(void) {
    enum {
        BENCH_US_INPUTS = 384,
        BENCH_US_OUTPUTS = 2 * BENCH_US_INPUTS,
        // Timer ticks per millisecond used for the throughput figure; this
        // corresponds to a 100 MHz reference clock.
        BENCH_US_TICKS_PER_MS = 100000
    };
    int output[BENCH_US_OUTPUTS];
    int output_fast[BENCH_US_OUTPUTS];
    int output_vpu[BENCH_US_OUTPUTS];
    // Timestamps are kept unsigned so that the differences computed below
    // stay well defined even if the timer counter wraps between two readings
    // (subtracting signed values could overflow, which is undefined).
    unsigned t0, t1, t2, t3;

    // Initialisation happens before the first timestamp, so it is excluded
    // from all measurements.
    us_fast_init();

    // First time the simple C version
    t0 = get_reference_time();
    for (int i = 0; i < BENCH_US_INPUTS; i++) {
        // One input per call, results go to output[2*i] onwards: 2x upsampling
        us_simple(&output[2*i], input_us[i]);
    }

    // Now time the lib_xcore_math version
    t1 = get_reference_time();
    for (int i = 0; i < BENCH_US_INPUTS; i++) {
        // One input per call, results go to output_fast[2*i] onwards: 2x upsampling
        us_fast(&output_fast[2*i], input_us[i]);
    }

    // Finally time the hand-rolled VPU version.
    t2 = get_reference_time();
    for (int i = 0; i < BENCH_US_INPUTS; i++) {
        // One input per call, results go to output_vpu[2*i] onwards: 2x upsampling
        us_vpu(&output_vpu[2*i], input_us[i]);
    }
    t3 = get_reference_time();

    // Elapsed ticks per pass; the modular unsigned difference is taken first
    // and only then narrowed to int for printing and division.
    const int ticks_simple = (int)(t1 - t0);
    const int ticks_fast = (int)(t2 - t1);
    const int ticks_vpu = (int)(t3 - t2);

    printf("Simple: %6d ticks for 768 outputs, %d output samples/ms\n",
           ticks_simple, BENCH_US_OUTPUTS * BENCH_US_TICKS_PER_MS / ticks_simple);
    printf("Fast:   %6d ticks for 768 outputs, %d output samples/ms\n",
           ticks_fast, BENCH_US_OUTPUTS * BENCH_US_TICKS_PER_MS / ticks_fast);
    printf("VPU:    %6d ticks for 768 outputs, %d output samples/ms\n",
           ticks_vpu, BENCH_US_OUTPUTS * BENCH_US_TICKS_PER_MS / ticks_vpu);

    // Two rows per input: the first shows the input with the even-indexed
    // outputs, the second shows the odd-indexed outputs on their own. The
    // columns are VPU, fast and simple, in that order.
    for (int i = 0; i < BENCH_US_INPUTS; i++) {
        printf("%3d %6d %6d %6d %6d\n", 2*i, input_us[i], output_vpu[2*i], output_fast[2*i], output[2*i]);
        printf("%3d        %6d %6d %6d\n", 2*i+1, output_vpu[2*i+1], output_fast[2*i+1], output[2*i+1]);
    }
}

/*
 * Program entry point: runs the upsampling benchmark and exits with status 0.
 * The downsampling benchmark (main_ds) is not invoked from here.
 */
int main(void)
{
    main_us();
    return 0;
}
