diff fft_Example/main.cc @ 2:ccea4e6a1945

add OpenCL example
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Tue, 22 Jan 2013 23:19:41 +0900 (2013-01-22)
parents
children f3cfea46e585
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fft_Example/main.cc	Tue Jan 22 23:19:41 2013 +0900
@@ -0,0 +1,882 @@
+
+//
+// File:       main.cpp
+//
+// Version:    <1.0>
+//
+// Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Inc. ("Apple")
+//             in consideration of your agreement to the following terms, and your use,
+//             installation, modification or redistribution of this Apple software
+//             constitutes acceptance of these terms.  If you do not agree with these
+//             terms, please do not use, install, modify or redistribute this Apple
+//             software.
+//
+//             In consideration of your agreement to abide by the following terms, and
+//             subject to these terms, Apple grants you a personal, non - exclusive
+//             license, under Apple's copyrights in this original Apple software ( the
+//             "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+//             Software, with or without modifications, in source and / or binary forms;
+//             provided that if you redistribute the Apple Software in its entirety and
+//             without modifications, you must retain this notice and the following text
+//             and disclaimers in all such redistributions of the Apple Software. Neither
+//             the name, trademarks, service marks or logos of Apple Inc. may be used to
+//             endorse or promote products derived from the Apple Software without specific
+//             prior written permission from Apple.  Except as expressly stated in this
+//             notice, no other rights or licenses, express or implied, are granted by
+//             Apple herein, including but not limited to any patent rights that may be
+//             infringed by your derivative works or by other works in which the Apple
+//             Software may be incorporated.
+//
+//             The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
+//             WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+//             WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+//             PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+//             ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+//             IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+//             CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+//             SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+//             INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+//             AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+//             UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+//             OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <OpenCL/opencl.h>
+#include "clFFT.h"
+#include <mach/mach_time.h>
+#include <Accelerate/Accelerate.h>
+#include "procs.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <float.h>
+
+// Pass/fail threshold for the average relative L2 error returned by
+// computeL2Error() (which is expressed in multiples of FLT_EPSILON);
+// runTest() reports a failure when the average exceeds this value.
+#define eps_avg 10.0
+
+// Larger of two values. Function-like macro: an argument may be evaluated twice.
+#define MAX( _a, _b)    ((_a)>(_b)?(_a) : (_b))
+
+// Whether a test writes the FFT result into separate output buffers or back
+// into the input buffers.
+typedef enum {
+    clFFT_OUT_OF_PLACE,
+    clFFT_IN_PLACE,
+}clFFT_TestType;
+
+// A single complex value in double precision.
+typedef struct
+{
+    double real;
+    double imag;
+}clFFT_ComplexDouble;
+
+// Complex array in split layout: one array of real parts, one of imaginary parts.
+typedef struct
+{
+    double *real;
+    double *imag;
+}clFFT_SplitComplexDouble;
+
+// OpenCL handles shared by the whole file: assigned in main(); context and
+// queue are used by runTest().
+cl_device_id     device_id;
+cl_context       context;
+cl_command_queue queue;
+
+typedef unsigned long long ulong;
+
+// Returns the time between two mach_absolute_time() readings, in seconds.
+// The tick-to-seconds factor comes from mach_timebase_info() and is cached
+// after the first successful query. Until a query succeeds the factor stays
+// 0.0, so the function returns 0.0 and retries the query on the next call.
+double subtractTimes( uint64_t endTime, uint64_t startTime )
+{
+    static double secondsPerTick = 0.0;
+    const uint64_t elapsedTicks = endTime - startTime;
+
+    if( secondsPerTick == 0.0 )
+    {
+        mach_timebase_info_data_t timebase;
+
+        // Convert the timebase into seconds
+        if( mach_timebase_info( &timebase ) == 0 )
+            secondsPerTick = 1e-9 * (double) timebase.numer / (double) timebase.denom;
+    }
+
+    return secondsPerTick * (double) elapsedTicks;
+}
+
+// Computes a single-precision reference FFT in place using vDSP.
+//
+// out       - split-complex data, overwritten with the transform. It holds
+//             batchSize consecutive data sets, each indexed with x varying
+//             fastest, then y, then z.
+// n         - extent of each dimension. log2 of every extent is truncated to
+//             an integer, so extents are expected to be powers of two.
+// batchSize - number of consecutive data sets in out.
+// dim       - clFFT_1D, clFFT_2D or clFFT_3D; any other value leaves out untouched.
+// dir       - clFFT_Forward selects FFT_FORWARD, anything else FFT_INVERSE.
+//
+// If the vDSP setup cannot be created, an error is logged and out is left
+// unchanged.
+void computeReferenceF(clFFT_SplitComplex *out, clFFT_Dim3 n,
+                      unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
+{
+    FFTSetup plan_vdsp;
+    DSPSplitComplex out_vdsp;
+    FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
+
+    unsigned int i, j, k;
+    unsigned int stride;
+    unsigned int log2Nx = (unsigned int) log2(n.x);
+    unsigned int log2Ny = (unsigned int) log2(n.y);
+    unsigned int log2Nz = (unsigned int) log2(n.z);
+    unsigned int log2N;
+
+    // One setup sized for the longest dimension serves every pass below.
+    log2N = log2Nx;
+    log2N = log2N > log2Ny ? log2N : log2Ny;
+    log2N = log2N > log2Nz ? log2N : log2Nz;
+
+    plan_vdsp = vDSP_create_fftsetup(log2N, 2);
+    if(!plan_vdsp)
+    {
+        // A null setup must not be handed to vDSP_fft_zip.
+        log_error("vDSP_create_fftsetup failed\n");
+        return;
+    }
+
+    switch(dim)
+    {
+        case clFFT_1D:
+
+            for(i = 0; i < batchSize; i++)
+            {
+                stride = i * n.x;
+                out_vdsp.realp  = out->real  + stride;
+                out_vdsp.imagp  = out->imag  + stride;
+
+                vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+            }
+            break;
+
+        case clFFT_2D:
+
+            // Pass 1: transform every row (contiguous along x).
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.y; j++)
+                {
+                    stride = j * n.x + i * n.x * n.y;
+                    out_vdsp.realp = out->real + stride;
+                    out_vdsp.imagp = out->imag + stride;
+
+                    vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+                }
+            }
+            // Pass 2: transform every column (element stride n.x).
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.x; j++)
+                {
+                    stride = j + i * n.x  * n.y;
+                    out_vdsp.realp = out->real + stride;
+                    out_vdsp.imagp = out->imag + stride;
+
+                    vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
+                }
+            }
+            break;
+
+        case clFFT_3D:
+
+            // Pass 1: along x, for every (y, z) line.
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.z; j++)
+                {
+                    for(k = 0; k < n.y; k++)
+                    {
+                        stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zip(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+                    }
+                }
+            }
+            // Pass 2: along y (element stride n.x), for every (x, z) line.
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.z; j++)
+                {
+                    for(k = 0; k < n.x; k++)
+                    {
+                        stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
+                    }
+                }
+            }
+            // Pass 3: along z (element stride n.x*n.y), for every (x, y) line.
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.y; j++)
+                {
+                    for(k = 0; k < n.x; k++)
+                    {
+                        stride = k + j * n.x + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zip(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
+                    }
+                }
+            }
+            break;
+
+        default:
+            // Unknown dimensionality: nothing to transform.
+            break;
+    }
+
+    vDSP_destroy_fftsetup(plan_vdsp);
+}
+
+// Computes a double-precision reference FFT in place using vDSP.
+//
+// out       - split-complex data, overwritten with the transform. It holds
+//             batchSize consecutive data sets, each indexed with x varying
+//             fastest, then y, then z.
+// n         - extent of each dimension. log2 of every extent is truncated to
+//             an integer, so extents are expected to be powers of two.
+// batchSize - number of consecutive data sets in out.
+// dim       - clFFT_1D, clFFT_2D or clFFT_3D; any other value leaves out untouched.
+// dir       - clFFT_Forward selects FFT_FORWARD, anything else FFT_INVERSE.
+//
+// If the vDSP setup cannot be created, an error is logged and out is left
+// unchanged.
+void computeReferenceD(clFFT_SplitComplexDouble *out, clFFT_Dim3 n,
+                      unsigned int batchSize, clFFT_Dimension dim, clFFT_Direction dir)
+{
+    FFTSetupD plan_vdsp;
+    DSPDoubleSplitComplex out_vdsp;
+    FFTDirection dir_vdsp = dir == clFFT_Forward ? FFT_FORWARD : FFT_INVERSE;
+
+    unsigned int i, j, k;
+    unsigned int stride;
+    unsigned int log2Nx = (int) log2(n.x);
+    unsigned int log2Ny = (int) log2(n.y);
+    unsigned int log2Nz = (int) log2(n.z);
+    unsigned int log2N;
+
+    // One setup sized for the longest dimension serves every pass below.
+    log2N = log2Nx;
+    log2N = log2N > log2Ny ? log2N : log2Ny;
+    log2N = log2N > log2Nz ? log2N : log2Nz;
+
+    plan_vdsp = vDSP_create_fftsetupD(log2N, 2);
+    if(!plan_vdsp)
+    {
+        // A null setup must not be handed to vDSP_fft_zipD.
+        log_error("vDSP_create_fftsetupD failed\n");
+        return;
+    }
+
+    switch(dim)
+    {
+        case clFFT_1D:
+
+            for(i = 0; i < batchSize; i++)
+            {
+                stride = i * n.x;
+                out_vdsp.realp  = out->real  + stride;
+                out_vdsp.imagp  = out->imag  + stride;
+
+                vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+            }
+            break;
+
+        case clFFT_2D:
+
+            // Pass 1: transform every row (contiguous along x).
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.y; j++)
+                {
+                    stride = j * n.x + i * n.x * n.y;
+                    out_vdsp.realp = out->real + stride;
+                    out_vdsp.imagp = out->imag + stride;
+
+                    vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+                }
+            }
+            // Pass 2: transform every column (element stride n.x).
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.x; j++)
+                {
+                    stride = j + i * n.x  * n.y;
+                    out_vdsp.realp = out->real + stride;
+                    out_vdsp.imagp = out->imag + stride;
+
+                    vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
+                }
+            }
+            break;
+
+        case clFFT_3D:
+
+            // Pass 1: along x, for every (y, z) line.
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.z; j++)
+                {
+                    for(k = 0; k < n.y; k++)
+                    {
+                        stride = k * n.x + j * n.x * n.y + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zipD(plan_vdsp, &out_vdsp, 1, log2Nx, dir_vdsp);
+                    }
+                }
+            }
+            // Pass 2: along y (element stride n.x), for every (x, z) line.
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.z; j++)
+                {
+                    for(k = 0; k < n.x; k++)
+                    {
+                        stride = k + j * n.x * n.y + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x, log2Ny, dir_vdsp);
+                    }
+                }
+            }
+            // Pass 3: along z (element stride n.x*n.y), for every (x, y) line.
+            for(i = 0; i < batchSize; i++)
+            {
+                for(j = 0; j < n.y; j++)
+                {
+                    for(k = 0; k < n.x; k++)
+                    {
+                        stride = k + j * n.x + i * n.x * n.y * n.z;
+                        out_vdsp.realp = out->real + stride;
+                        out_vdsp.imagp = out->imag + stride;
+
+                        vDSP_fft_zipD(plan_vdsp, &out_vdsp, n.x*n.y, log2Nz, dir_vdsp);
+                    }
+                }
+            }
+            break;
+
+        default:
+            // Unknown dimensionality: nothing to transform.
+            break;
+    }
+
+    vDSP_destroy_fftsetupD(plan_vdsp);
+}
+
+// Returns the squared magnitude of a complex value: real^2 + imag^2.
+double complexNormSq(clFFT_ComplexDouble a)
+{
+    return (a.real * a.real + a.imag * a.imag);
+}
+
+// Measures how far single-precision results are from a double-precision
+// reference, one data set at a time.
+//
+// data      - results under test (split complex, float)
+// data_ref  - reference results (split complex, double)
+// n         - number of complex elements per data set
+// batchSize - number of consecutive data sets
+// max_diff  - receives the largest per-data-set error
+// min_diff  - receives the smallest per-data-set error
+//
+// Each per-data-set error is sqrt(sum|ref - data|^2 / sum|ref|^2) divided by
+// FLT_EPSILON, i.e. a relative L2 error in multiples of float epsilon.
+// Returns the average of those errors, or 0.0 when batchSize is not positive.
+double computeL2Error(clFFT_SplitComplex *data, clFFT_SplitComplexDouble *data_ref, int n, int batchSize, double *max_diff, double *min_diff)
+{
+    int i, j;
+    double avg_norm = 0.0;
+    *max_diff = 0.0;
+    *min_diff = 0x1.0p1000;
+
+    // Nothing to average; avoid dividing by zero below.
+    if(batchSize <= 0)
+        return 0.0;
+
+    for(j = 0; j < batchSize; j++)
+    {
+        double norm_ref = 0.0;
+        double norm = 0.0;
+        for(i = 0; i < n; i++)
+        {
+            int index = j * n + i;
+            clFFT_ComplexDouble diff = (clFFT_ComplexDouble) { data_ref->real[index] - data->real[index], data_ref->imag[index] - data->imag[index] };
+            double norm_tmp = complexNormSq(diff);
+            norm += norm_tmp;
+            norm_ref += (data_ref->real[index] * data_ref->real[index] + data_ref->imag[index] * data_ref->imag[index]);
+        }
+
+        double curr_norm;
+        if(norm_ref == 0.0)
+        {
+            // An all-zero reference would give 0/0 = NaN, which compares false
+            // against any threshold and would be reported as a pass. Treat an
+            // exact match as zero error and anything else as unbounded error.
+            curr_norm = (norm == 0.0) ? 0.0 : INFINITY;
+        }
+        else
+        {
+            curr_norm = sqrt( norm / norm_ref ) / FLT_EPSILON;
+        }
+
+        avg_norm += curr_norm;
+        *max_diff = *max_diff < curr_norm ? curr_norm : *max_diff;
+        *min_diff = *min_diff > curr_norm ? curr_norm : *min_diff;
+    }
+
+    return avg_norm / batchSize;
+}
+
+// Copies `length` interleaved complex values from data_cl into the separate
+// real and imag arrays of result_split. Does nothing when length is not positive.
+void convertInterleavedToSplit(clFFT_SplitComplex *result_split, clFFT_Complex *data_cl, int length)
+{
+    if(length <= 0)
+        return;
+
+    const clFFT_Complex *src = data_cl;
+    const clFFT_Complex *const srcEnd = data_cl + length;
+    int dst = 0;
+
+    while(src != srcEnd) {
+        result_split->real[dst] = src->real;
+        result_split->imag[dst] = src->imag;
+        ++src;
+        ++dst;
+    }
+}
+
+// Runs one clFFT configuration on the global context/queue, reports its
+// throughput, and checks the result against a double-precision vDSP reference.
+//
+// n          - FFT extent in each dimension
+// batchSize  - number of transforms per execute call
+// dir        - transform direction
+// dim        - dimensionality of the transform
+// dataFormat - clFFT_SplitComplexFormat uses separate real/imag buffers; any
+//              other value uses interleaved clFFT_Complex buffers
+// numIter    - number of times the plan is executed inside the timed region
+// testType   - clFFT_OUT_OF_PLACE creates separate output buffers; otherwise
+//              the input buffers double as output
+//
+// Returns err: CL_SUCCESS when every step ran, otherwise the value err held at
+// the step that failed (negative codes -1..-4 mark host allocation failures).
+// NOTE(review): an accuracy failure (error above eps_avg) is only logged; it
+// does not change the return value.
+int runTest(clFFT_Dim3 n, int batchSize, clFFT_Direction dir, clFFT_Dimension dim,
+            clFFT_DataFormat dataFormat, int numIter, clFFT_TestType testType)
+{
+    cl_int err = CL_SUCCESS;
+    int iter;
+    double t;
+
+    uint64_t t0, t1;
+    int mx = (int)log2(n.x);
+    int my = (int)log2(n.y);
+    int mz = (int)log2(n.z);
+
+    int length = n.x * n.y * n.z * batchSize;
+
+    // Operation count estimate: 5 * N * log2(N) per transform, scaled to 1e9.
+    double gflops = 5e-9 * ((double)mx + (double)my + (double)mz) * (double)n.x * (double)n.y * (double)n.z * (double)batchSize * (double)numIter;
+
+    clFFT_SplitComplex data_i_split = (clFFT_SplitComplex) { NULL, NULL };
+    clFFT_SplitComplex data_cl_split = (clFFT_SplitComplex) { NULL, NULL };
+    clFFT_Complex *data_i = NULL;
+    clFFT_Complex *data_cl = NULL;
+    clFFT_SplitComplexDouble data_iref = (clFFT_SplitComplexDouble) { NULL, NULL };
+    clFFT_SplitComplexDouble data_oref = (clFFT_SplitComplexDouble) { NULL, NULL };
+
+    clFFT_Plan plan = NULL;
+    cl_mem data_in = NULL;
+    cl_mem data_out = NULL;
+    cl_mem data_in_real = NULL;
+    cl_mem data_in_imag = NULL;
+    cl_mem data_out_real = NULL;
+    cl_mem data_out_imag = NULL;
+
+    // Host buffers for the device input and the result read back from it.
+    if(dataFormat == clFFT_SplitComplexFormat) {
+        data_i_split.real     = (float *) malloc(sizeof(float) * length);
+        data_i_split.imag     = (float *) malloc(sizeof(float) * length);
+        data_cl_split.real    = (float *) malloc(sizeof(float) * length);
+        data_cl_split.imag    = (float *) malloc(sizeof(float) * length);
+        if(!data_i_split.real || !data_i_split.imag || !data_cl_split.real || !data_cl_split.imag)
+        {
+            err = -1;
+            log_error("Out-of-Resources\n");
+            goto cleanup;
+        }
+    }
+    else {
+        data_i  = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
+        data_cl = (clFFT_Complex *) malloc(sizeof(clFFT_Complex)*length);
+        if(!data_i || !data_cl)
+        {
+            err = -2;
+            log_error("Out-of-Resouces\n");
+            goto cleanup;
+        }
+    }
+
+    // Double-precision copies used to compute the reference result.
+    data_iref.real   = (double *) malloc(sizeof(double) * length);
+    data_iref.imag   = (double *) malloc(sizeof(double) * length);
+    data_oref.real   = (double *) malloc(sizeof(double) * length);
+    data_oref.imag   = (double *) malloc(sizeof(double) * length);
+    if(!data_iref.real || !data_iref.imag || !data_oref.real || !data_oref.imag)
+    {
+        err = -3;
+        log_error("Out-of-Resources\n");
+        goto cleanup;
+    }
+
+    // Fill the input with random values in [-1, 1] and mirror them into the
+    // reference arrays; the result buffers start at zero.
+    int i;
+    if(dataFormat == clFFT_SplitComplexFormat) {
+        for(i = 0; i < length; i++)
+        {
+            data_i_split.real[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
+            data_i_split.imag[i] = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
+            data_cl_split.real[i] = 0.0f;
+            data_cl_split.imag[i] = 0.0f;
+            data_iref.real[i] = data_i_split.real[i];
+            data_iref.imag[i] = data_i_split.imag[i];
+            data_oref.real[i] = data_iref.real[i];
+            data_oref.imag[i] = data_iref.imag[i];
+        }
+    }
+    else {
+        for(i = 0; i < length; i++)
+        {
+            data_i[i].real = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
+            data_i[i].imag = 2.0f * (float) rand() / (float) RAND_MAX - 1.0f;
+            data_cl[i].real = 0.0f;
+            data_cl[i].imag = 0.0f;
+            data_iref.real[i] = data_i[i].real;
+            data_iref.imag[i] = data_i[i].imag;
+            data_oref.real[i] = data_iref.real[i];
+            data_oref.imag[i] = data_iref.imag[i];
+        }
+    }
+
+    plan = clFFT_CreatePlan( context, n, dim, dataFormat, &err );
+    if(!plan || err)
+    {
+        log_error("clFFT_CreatePlan failed\n");
+        goto cleanup;
+    }
+
+    //clFFT_DumpPlan(plan, stdout);
+
+    // Device buffers. For in-place tests the output handles alias the inputs.
+    if(dataFormat == clFFT_SplitComplexFormat)
+    {
+        data_in_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.real, &err);
+        if(!data_in_real || err)
+        {
+            log_error("clCreateBuffer failed\n");
+            goto cleanup;
+        }
+
+        data_in_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_i_split.imag, &err);
+        if(!data_in_imag || err)
+        {
+            log_error("clCreateBuffer failed\n");
+            goto cleanup;
+        }
+
+        if(testType == clFFT_OUT_OF_PLACE)
+        {
+            data_out_real = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.real, &err);
+            if(!data_out_real || err)
+            {
+                log_error("clCreateBuffer failed\n");
+                goto cleanup;
+            }
+
+            data_out_imag = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float), data_cl_split.imag, &err);
+            if(!data_out_imag || err)
+            {
+                log_error("clCreateBuffer failed\n");
+                goto cleanup;
+            }
+        }
+        else
+        {
+            data_out_real = data_in_real;
+            data_out_imag = data_in_imag;
+        }
+    }
+    else
+    {
+        data_in = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_i, &err);
+        if(!data_in || err)
+        {
+            log_error("clCreateBuffer failed\n");
+            goto cleanup;
+        }
+        if(testType == clFFT_OUT_OF_PLACE)
+        {
+            data_out = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length*sizeof(float)*2, data_cl, &err);
+            if(!data_out || err)
+            {
+                log_error("clCreateBuffer failed\n");
+                goto cleanup;
+            }
+        }
+        else
+            data_out = data_in;
+    }
+
+
+    err = CL_SUCCESS;
+
+    // Timed region: enqueue numIter executions, then wait for the queue to drain.
+    t0 = mach_absolute_time();
+    if(dataFormat == clFFT_SplitComplexFormat)
+    {
+        for(iter = 0; iter < numIter; iter++)
+            err |= clFFT_ExecutePlannar(queue, plan, batchSize, dir, data_in_real, data_in_imag, data_out_real, data_out_imag, 0, NULL, NULL);
+    }
+    else
+    {
+        for(iter = 0; iter < numIter; iter++)
+            err |= clFFT_ExecuteInterleaved(queue, plan, batchSize, dir, data_in, data_out, 0, NULL, NULL);
+    }
+
+    err |= clFinish(queue);
+
+    if(err)
+    {
+        log_error("clFFT_Execute\n");
+        goto cleanup;
+    }
+
+    t1 = mach_absolute_time();
+    t = subtractTimes(t1, t0);
+    char temp[100];
+    // snprintf cannot write past the end of temp.
+    snprintf(temp, sizeof(temp), "GFlops achieved for n = (%d, %d, %d), batchsize = %d", n.x, n.y, n.z, batchSize);
+    log_perf(gflops / (float) t, 1, "GFlops/s", "%s", temp);
+
+    // Blocking reads of the device result into the host result buffers.
+    if(dataFormat == clFFT_SplitComplexFormat)
+    {
+        err |= clEnqueueReadBuffer(queue, data_out_real, CL_TRUE, 0, length*sizeof(float), data_cl_split.real, 0, NULL, NULL);
+        err |= clEnqueueReadBuffer(queue, data_out_imag, CL_TRUE, 0, length*sizeof(float), data_cl_split.imag, 0, NULL, NULL);
+    }
+    else
+    {
+        err |= clEnqueueReadBuffer(queue, data_out, CL_TRUE, 0, length*sizeof(float)*2, data_cl, 0, NULL, NULL);
+    }
+
+    if(err)
+    {
+        log_error("clEnqueueReadBuffer failed\n");
+        goto cleanup;
+    }
+
+    computeReferenceD(&data_oref, n, batchSize, dim, dir);
+
+    double diff_avg, diff_max, diff_min;
+    if(dataFormat == clFFT_SplitComplexFormat) {
+        diff_avg = computeL2Error(&data_cl_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
+    }
+    else {
+        // computeL2Error works on split data, so de-interleave the result first.
+        clFFT_SplitComplex result_split;
+        result_split.real = (float *) malloc(length*sizeof(float));
+        result_split.imag = (float *) malloc(length*sizeof(float));
+        if(!result_split.real || !result_split.imag)
+        {
+            free(result_split.real);
+            free(result_split.imag);
+            err = -4;
+            log_error("Out-of-Resources\n");
+            goto cleanup;
+        }
+        convertInterleavedToSplit(&result_split, data_cl, length);
+        diff_avg = computeL2Error(&result_split, &data_oref, n.x*n.y*n.z, batchSize, &diff_max, &diff_min);
+        free(result_split.real);
+        free(result_split.imag);
+    }
+
+    if(diff_avg > eps_avg)
+        log_error("Test failed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
+    else
+        log_info("Test passed (n=(%d, %d, %d), batchsize=%d): %s Test: rel. L2-error = %f eps (max=%f eps, min=%f eps)\n", n.x, n.y, n.z, batchSize, (testType == clFFT_OUT_OF_PLACE) ? "out-of-place" : "in-place", diff_avg, diff_max, diff_min);
+
+cleanup:
+    clFFT_DestroyPlan(plan);
+    if(dataFormat == clFFT_SplitComplexFormat)
+    {
+        free(data_i_split.real);
+        free(data_i_split.imag);
+        free(data_cl_split.real);
+        free(data_cl_split.imag);
+
+        if(data_in_real)
+            clReleaseMemObject(data_in_real);
+        if(data_in_imag)
+            clReleaseMemObject(data_in_imag);
+        // In-place output handles alias the inputs released above, so only
+        // out-of-place outputs are released here.
+        if(testType == clFFT_OUT_OF_PLACE)
+        {
+            if(data_out_real)
+                clReleaseMemObject(data_out_real);
+            if(data_out_imag)
+                clReleaseMemObject(data_out_imag);
+        }
+    }
+    else
+    {
+        free(data_i);
+        free(data_cl);
+
+        if(data_in)
+            clReleaseMemObject(data_in);
+        if(data_out && testType == clFFT_OUT_OF_PLACE)
+            clReleaseMemObject(data_out);
+    }
+
+    free(data_iref.real);
+    free(data_iref.imag);
+    free(data_oref.real);
+    free(data_oref.imag);
+
+    return err;
+}
+
+// Returns true when the text contains "//" anywhere, false otherwise.
+bool ifLineCommented(const char *line) {
+    return strstr(line, "//") != NULL;
+}
+
+// Maps the CL_DEVICE_TYPE environment variable to an OpenCL device type.
+// Accepts the short names "gpu", "cpu", "accelerator" and the full
+// CL_DEVICE_TYPE_* spellings (including CL_DEVICE_TYPE_DEFAULT). Returns
+// CL_DEVICE_TYPE_GPU when the variable is unset or holds anything else.
+cl_device_type getGlobalDeviceType()
+{
+    const char *requested = getenv( "CL_DEVICE_TYPE" );
+
+    if( requested == NULL )
+        return CL_DEVICE_TYPE_GPU;
+
+    if( !strcmp( requested, "gpu" ) || !strcmp( requested, "CL_DEVICE_TYPE_GPU" ) )
+        return CL_DEVICE_TYPE_GPU;
+
+    if( !strcmp( requested, "cpu" ) || !strcmp( requested, "CL_DEVICE_TYPE_CPU" ) )
+        return CL_DEVICE_TYPE_CPU;
+
+    if( !strcmp( requested, "accelerator" ) || !strcmp( requested, "CL_DEVICE_TYPE_ACCELERATOR" ) )
+        return CL_DEVICE_TYPE_ACCELERATOR;
+
+    if( !strcmp( requested, "CL_DEVICE_TYPE_DEFAULT" ) )
+        return CL_DEVICE_TYPE_DEFAULT;
+
+    // default
+    return CL_DEVICE_TYPE_GPU;
+}
+
+void
+notify_callback(const char *errinfo, const void *private_info, size_t cb, void *user_data)
+{
+    log_error( "%s\n", errinfo );
+}
+
+// Checks whether a test configuration fits in device memory.
+//
+// n         - FFT extent in each dimension
+// batchSize - number of transforms per batch
+// testType  - out-of-place tests are budgeted 3x the data size, in-place 2x
+// gMemSize  - device global memory, in the same unit as the computed
+//             requirement (whole mebibytes; main() passes bytes / 1024 / 1024)
+//
+// Returns 0 when the requirement is below gMemSize, -1 otherwise.
+int
+checkMemRequirements(clFFT_Dim3 n, int batchSize, clFFT_TestType testType, cl_ulong gMemSize)
+{
+    cl_ulong memReq = (testType == clFFT_OUT_OF_PLACE) ? 3 : 2;
+
+    // Widen every factor before multiplying so the element count cannot wrap
+    // in a narrower type before it reaches memReq.
+    memReq *= (cl_ulong) n.x;
+    memReq *= (cl_ulong) n.y;
+    memReq *= (cl_ulong) n.z;
+    memReq *= (cl_ulong) sizeof(clFFT_Complex);
+    memReq *= (cl_ulong) batchSize;
+
+    // Bytes -> whole mebibytes.
+    memReq = memReq/1024/1024;
+    if(memReq >= gMemSize)
+        return -1;
+    return 0;
+}
+
+// Test driver. Selects the first available GPU device, creates the global
+// context and command queue, then runs one test per non-comment line of the
+// parameter file named by argv[1].
+//
+// Recognized parameters, each followed by its value(s) on the same line:
+//   -n X Y Z, -batchsize N, -dir forward|inverse, -dim 1D|2D|3D,
+//   -format plannar|interleaved, -numiter N, -testtype out-of-place|in-place
+// Settings carry over from one line to the next unless overridden. Lines
+// containing "//" and empty lines are skipped. A line with a parameter that
+// lacks its value is reported and skipped.
+//
+// Returns the number of runTest() calls that reported an error, or a negative
+// value when setup fails. With more than one argument no test is run.
+int main (int argc, char * const argv[]) {
+
+    test_start();
+
+    cl_ulong gMemSize;
+    clFFT_Direction dir = clFFT_Forward;
+    int numIter = 1;
+    clFFT_Dim3 n = { 1024, 1, 1 };
+    int batchSize = 1;
+    clFFT_DataFormat dataFormat = clFFT_SplitComplexFormat;
+    clFFT_Dimension dim = clFFT_1D;
+    clFFT_TestType testType = clFFT_OUT_OF_PLACE;
+    cl_device_id device_ids[16];
+    const unsigned int max_devices = sizeof(device_ids) / sizeof(device_ids[0]);
+
+    FILE *paramFile;
+
+    cl_int err;
+    unsigned int num_devices;
+
+    cl_device_type device_type = getGlobalDeviceType();
+    if(device_type != CL_DEVICE_TYPE_GPU)
+    {
+        log_info("Test only supported on DEVICE_TYPE_GPU\n");
+        test_finish();
+        exit(0);
+    }
+
+    // The third argument is a number of entries, not a size in bytes.
+    err = clGetDeviceIDs(NULL, device_type, max_devices, device_ids, &num_devices);
+    if(err)
+    {
+        log_error("clGetComputeDevice failed\n");
+        test_finish();
+        return -1;
+    }
+
+    // num_devices reports how many devices exist, which may exceed the array.
+    if(num_devices > max_devices)
+        num_devices = max_devices;
+
+    device_id = NULL;
+
+    unsigned int i;
+    for(i = 0; i < num_devices; i++)
+    {
+        // Start from "not available" so a failed query cannot leave this unset.
+        cl_bool available = CL_FALSE;
+        err = clGetDeviceInfo(device_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL);
+        if(err)
+        {
+            log_error("Cannot check device availability of device # %d\n", i);
+        }
+
+        if(available)
+        {
+            device_id = device_ids[i];
+            break;
+        }
+        else
+        {
+            char name[200];
+            err = clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(name), name, NULL);
+            if(err == CL_SUCCESS)
+            {
+                log_info("Device %s not available for compute\n", name);
+            }
+            else
+            {
+                log_info("Device # %d not available for compute\n", i);
+            }
+        }
+    }
+
+    if(!device_id)
+    {
+        log_error("None of the devices available for compute ... aborting test\n");
+        test_finish();
+        return -1;
+    }
+
+    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
+    if(!context || err)
+    {
+        log_error("clCreateContext failed\n");
+        test_finish();
+        return -1;
+    }
+
+    queue = clCreateCommandQueue(context, device_id, 0, &err);
+    if(!queue || err)
+    {
+        log_error("clCreateCommandQueue() failed.\n");
+        clReleaseContext(context);
+        test_finish();
+        return -1;
+    }
+
+    err = clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &gMemSize, NULL);
+    if(err)
+    {
+        log_error("Failed to get global mem size\n");
+        clReleaseCommandQueue(queue);
+        clReleaseContext(context);
+        test_finish();
+        return -2;
+    }
+
+    // checkMemRequirements() compares in whole mebibytes.
+    gMemSize /= (1024*1024);
+
+    char delim[] = " \n";
+    char tmpStr[100];
+    char line[200];
+    char *param, *val;
+    int total_errors = 0;
+    if(argc == 1) {
+        log_error("Need file name with list of parameters to run the test\n");
+        clReleaseCommandQueue(queue);
+        clReleaseContext(context);
+        test_finish();
+        return -1;
+    }
+
+    if(argc == 2) { // arguments are supplied in a file with arguments for a single run are all on the same line
+        paramFile = fopen(argv[1], "r");
+        if(!paramFile) {
+            log_error("Cannot open the parameter file\n");
+            clReleaseCommandQueue(queue);
+            clReleaseContext(context);
+            test_finish();
+            return -3;
+        }
+        while(fgets(line, sizeof(line), paramFile)) {
+            if(!strcmp(line, "") || !strcmp(line, "\n") || ifLineCommented(line))
+                continue;
+
+            int malformed = 0;
+            param = strtok(line, delim);
+            while(param) {
+                // Every parameter needs at least one value token.
+                val = strtok(NULL, delim);
+                if(!val) {
+                    malformed = 1;
+                    break;
+                }
+                if(!strcmp(param, "-n")) {
+                    char *valY = strtok(NULL, delim);
+                    char *valZ = strtok(NULL, delim);
+                    if(!valY || !valZ) {
+                        malformed = 1;
+                        break;
+                    }
+                    sscanf(val, "%d", &n.x);
+                    sscanf(valY, "%d", &n.y);
+                    sscanf(valZ, "%d", &n.z);
+                }
+                else if(!strcmp(param, "-batchsize"))
+                    sscanf(val, "%d", &batchSize);
+                else if(!strcmp(param, "-dir")) {
+                    // %99s keeps the token within tmpStr[100].
+                    sscanf(val, "%99s", tmpStr);
+                    if(!strcmp(tmpStr, "forward"))
+                        dir = clFFT_Forward;
+                    else if(!strcmp(tmpStr, "inverse"))
+                        dir = clFFT_Inverse;
+                }
+                else if(!strcmp(param, "-dim")) {
+                    sscanf(val, "%99s", tmpStr);
+                    if(!strcmp(tmpStr, "1D"))
+                        dim = clFFT_1D;
+                    else if(!strcmp(tmpStr, "2D"))
+                        dim = clFFT_2D;
+                    else if(!strcmp(tmpStr, "3D"))
+                        dim = clFFT_3D;
+                }
+                else if(!strcmp(param, "-format")) {
+                    sscanf(val, "%99s", tmpStr);
+                    if(!strcmp(tmpStr, "plannar"))
+                        dataFormat = clFFT_SplitComplexFormat;
+                    else if(!strcmp(tmpStr, "interleaved"))
+                        dataFormat = clFFT_InterleavedComplexFormat;
+                }
+                else if(!strcmp(param, "-numiter"))
+                    sscanf(val, "%d", &numIter);
+                else if(!strcmp(param, "-testtype")) {
+                    sscanf(val, "%99s", tmpStr);
+                    if(!strcmp(tmpStr, "out-of-place"))
+                        testType = clFFT_OUT_OF_PLACE;
+                    else if(!strcmp(tmpStr, "in-place"))
+                        testType = clFFT_IN_PLACE;
+                }
+                param = strtok(NULL, delim);
+            }
+
+            if(malformed) {
+                log_error("Malformed parameter line: missing value for %s\n", param);
+                continue;
+            }
+
+            if(checkMemRequirements(n, batchSize, testType, gMemSize)) {
+                log_info("This test cannot run because memory requirements canot be met by the available device\n");
+                continue;
+            }
+
+            err = runTest(n, batchSize, dir, dim, dataFormat, numIter, testType);
+            if (err)
+                total_errors++;
+        }
+        fclose(paramFile);
+    }
+
+    clReleaseCommandQueue(queue);
+    clReleaseContext(context);
+
+    test_finish();
+    return total_errors;
+}