changeset 2021:8130b38b5391 draft

merge
author Masataka Kohagura <kohagura@cr.ie.u-ryukyu.ac.jp>
date Wed, 16 Jul 2014 01:28:20 +0900
parents 6849865f96eb (diff) 630eef931336 (current diff)
children fac44ad2867d
files
diffstat 34 files changed, 1006 insertions(+), 64 deletions(-) [+]
line wrap: on
line diff
--- a/TaskManager/Cuda/CudaScheduler.cc	Tue Jun 17 14:17:39 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -13,6 +13,9 @@
 #include <sys/stat.h>
 #include <string.h>
 #include <cuda.h>
+#include <map>
+
+using namespace std;
 
 TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
@@ -125,47 +128,39 @@
         // parameter is passed as first kernel arg 
         ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream);
         if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-        
+        cudabuffer[cur].kernelParams[param] = &cudabuffer[cur].memin[param];
         param++;
         
         for(int i=0;i<nextTask->inData_count;i++) {
             ListElement *input_buf = nextTask->inData(i);
             if (input_buf->size==0) break;
-            createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-            ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-            
+            if (!transmitted.count(input_buf->addr)) {
+                createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+                ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+                transmitted.insert(make_pair(input_buf->addr, &cudabuffer[cur].memin[param]));
+                reverse_map.insert(make_pair(&cudabuffer[cur].memin[param], input_buf->addr));
+            }
+            cudabuffer[cur].kernelParams[param] = transmitted[input_buf->addr];
             param++;
         }
+
         cudabuffer[cur].in_size = param; // +1 means param
         
         for(int i = 0; i<nextTask->outData_count;i++) { // set output data
             ListElement *output_buf = nextTask->outData(i);
             if (output_buf->size==0) break;
-            if (!flag[cur].flip) { // flip use memin for output 
+            if (!transmitted.count(output_buf->addr)) {
                 createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size);
                 if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
-                // enqueue later
+                transmitted.insert(make_pair(output_buf->addr, &cudabuffer[cur].memout[i]));
+                reverse_map.insert(make_pair(&cudabuffer[cur].memout[i], output_buf->addr));
+                cudabuffer[cur].kernelParams[param] = transmitted[output_buf->addr];
+                param++;
             }
-            param++;
         }
         cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event
-        
-        if (!flag[cur].flip) {
-            for (int i = 0; i<cudabuffer[cur].in_size; i++) {
-                cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
-            }
-            for (int i = 0; i<cudabuffer[cur].out_size; i++) {
-                cudabuffer[cur].kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i];
-            }
-        } else {
-            for (int i = 0; i<cudabuffer[cur].in_size; i++) {
-                cudabuffer[cur].kernelParams[i] = &cudabuffer[cur].memin[i];
-            }
-        }
-        
-        if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
     }
     return cur;
 }
@@ -193,33 +188,36 @@
     int cur = 0;
     for (;nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) {
         if (STAGE <= cur) break;
+        // enable flip : not data transfer device to host
+        if (flag[cur].flip) continue;
         for(int i=0;i<nextTask->outData_count;i++) { // read output data
             ListElement *output_buf = nextTask->outData(i);
             if (output_buf->size==0) break;
-            CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ;
-            int i0 = flag[cur].flip ? i+1 : i ;
-            // flip use memin buffer and memout event
-            ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream);
-            if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+            if (transmitted.count(output_buf->addr)) {
+                ret = cuMemcpyDtoHAsync(output_buf->addr, *transmitted[output_buf->addr], output_buf->size, cudabuffer[cur].stream);
+                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
+                reverse_map.erase(transmitted[output_buf->addr]);
+                transmitted.erase(output_buf->addr);
+            }
         }
     }
     return nextTask;
 }
 
 static void
-release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) {
+release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem, map<CUdeviceptr*, memaddr> map) {
     for (int i=0; i<mem[cur].in_size; i++) {
-        if (mem[cur].memin[i])
+        if (!map.count(&mem[cur].memin[i])) {
             cuMemFree(mem[cur].memin[i]);
-        mem[cur].memin[i] = 0;
+            mem[cur].memin[i] = 0;
+        }
     }
     for (int i=0; i<mem[cur].out_size; i++) {
-        if (mem[cur].memout[i])
+        if (!map.count(&mem[cur].memout[i])) {
             cuMemFree(mem[cur].memout[i]);
-        mem[cur].memout[i] = 0;
+            mem[cur].memout[i] = 0;
+        }
     }
-    mem[cur].in_size = 0;
-    mem[cur].out_size = 0;
 }
 
 void
@@ -244,7 +242,7 @@
     
     for (int i=0;i<cur;i++) {
         if (cudabuffer[i].in_size > 0 || cudabuffer[i].out_size > 0)
-            release_buf_event(i, cudabuffer);
+            release_buf_event(i, cudabuffer, reverse_map);
     }
 
     if(reply) {
--- a/TaskManager/Cuda/CudaScheduler.h	Tue Jun 17 14:17:39 2014 +0900
+++ b/TaskManager/Cuda/CudaScheduler.h	Wed Jul 16 01:28:20 2014 +0900
@@ -7,9 +7,12 @@
 #include "HTask.h"
 #include "TaskManager.h"
 #include <cuda.h>
+#include <map>
 
 extern TaskObject cuda_task_list[MAX_TASK_OBJECT];
 
+using namespace std;
+
 #define STAGE 8
 
 class CudaScheduler : public MainScheduler {
@@ -33,7 +36,6 @@
     // platform は OpenCL が複数のメーカーの GPU に対応してるから必要
     // Cuda の場合、NVIDIA だけなので必要ない?
     CUdevice device;
-    unsigned int ret_num_platforms; // たぶん要らない
     int ret_num_devices;
     CUcontext context;
     // command_queue command_queue;
@@ -42,11 +44,13 @@
     int ret;
     memaddr reply;
     // cl_kernel に相当
-    // 変数名は function にすべきか kernel にすべきか
-    // とりあえず、kernel で
     CUfunction kernel[STAGE];
     CudaBuffer cudabuffer[STAGE];
     
+    // record transmitted data.
+    map<memaddr, CUdeviceptr*> transmitted;
+    map<CUdeviceptr*, memaddr> reverse_map;
+
     HTask::htask_flag flag[STAGE];
     
  private:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/Func.h	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,6 @@
+enum {
+#include "SysTasks.h"
+    ADD_TASK,
+};
+
+#define DATA_NUM 10
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/Makefile	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,19 @@
+default: macosx
+
+macosx: FORCE
+	@echo "Make for Mac OS X"
+	@$(MAKE) -f Makefile.macosx
+
+fifo64: FORCE
+	@echo "Make for Mac OS X 64bit mode"
+	@$(MAKE) -f Makefile.macosx ABIBIT=64
+
+cuda: FORCE
+	@echo "Make for GPU (cuda)"
+	@$(MAKE) -f Makefile.cuda
+
+FORCE:
+
+clean:
+	@$(MAKE) -f Makefile.macosx clean
+	@$(MAKE) -f Makefile.cuda clean
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/Makefile.cuda	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,51 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE =  # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+TASK_DIR  = ppe
+CUDA_TASK_DIR = cuda
+
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
+TASK_SRCS_EXCLUDE = 
+TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP)) $(wildcard $(CUDA_TASK_DIR)/*.cc)
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+CUDA_SRCS_TMP = $(wildcard $(CUDA_TASK_DIR)/*.cu)
+CUDA_SRCS_EXCLUDE = # 除外するファイルを書く
+CUDA_SRCS = $(filter-out $(CUDA_TASK_DIR)/$(CUDA_SRCS_EXCLUDE),$(CUDA_SRCS_TMP))
+CUDA_OBJS = $(CUDA_SRCS:.cu=.ptx)
+
+CFLAGS += -D__CERIUM_GPU__
+LIBS += `sdl-config --libs` -lCudaManager -F/Library/Frameworks -framework CUDA
+
+INCLUDE += -I$(CUDA_PATH)
+
+NVCC = nvcc
+NVCCFLAGS = -ptx -arch=sm_20
+
+.SUFFIXES: .cc .o .cu .ptx
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+.cu.ptx:
+	$(NVCC) $(NVCCFLAGS) $< -o $@
+
+all: $(TARGET) 
+
+$(TARGET): $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo ppu-gdb ./$(TARGET) 
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
+	rm -f *~ \#*
+	rm -f cuda/*~ cuda/\#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/Makefile.def	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,15 @@
+TARGET = flipTest
+
+# include/library path
+# ex  macosx
+#CERIUM = /Users/gongo/Source/Cerium
+
+# ex  linux/ps3
+CERIUM = ../../../../Cerium
+
+CC      = clang++
+OPT = -g
+CFLAGS  = $(OPT) -Wall 
+
+INCLUDE = -I..  -I. -I${CERIUM}/include/TaskManager
+LIBS = -L${CERIUM}/TaskManager
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/Makefile.macosx	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,39 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE =  # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+ABIBIT=64
+
+TASK_DIR = ppe
+#GPU_TASK_DIR = gpu
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
+TASK_SRCS_EXCLUDE = 
+TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP))
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+LIBS += -lFifoManager `sdl-config --libs`
+CC += -m$(ABIBIT)
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS) $(TASK_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo gdb ./$(TARGET) 
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS)
+	rm -f *~ \#*
+	rm -f ppe/*~ ppe/\#*
+	rm -f spe/*~ spe/\#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/cuda/add.cu	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,6 @@
+extern "C" {
+    __global__ void add(long* params, int* A, int* B) {
+        int id = blockIdx.x * blockDim.x + threadIdx.x;
+        B[id] = B[id]+A[id];
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/cuda/gpu_task_init.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,16 @@
+#include "Func.h"
+#include "CudaScheduler.h"
+
+/* 必ずこの位置に書いて */
+
+/**
+ * この関数は ../spe/spe-main と違って
+ * 自分で呼び出せばいい関数なので
+ * 好きな関数名でおk (SchedRegisterTask は必須)
+ */
+
+void
+gpu_task_init(void)
+{
+    CudaSchedRegister(ADD_TASK, "cuda/add.ptx","add");
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/main.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "TaskManager.h"
+#include "Func.h"
+
+extern void task_init(void);
+extern void gpu_task_init(void);
+static int task = 3;
+static int length = DATA_NUM;
+static CPU_TYPE spe_cpu = SPE_ANY;
+const char *usr_help_str = "Usage: ./multiply \n";
+static int print_flag = 0;
+static int flip_flag = 0;
+void TMend(TaskManager *);
+
+int* A;
+int* B;
+
+static double st_time = 0 ;
+static double ed_time = 0;
+
+static double
+getTime() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
+static void
+print_result() {
+    printf("---\n");
+    if(print_flag == 1) {
+        for (int i =0;i<length;i++) {
+            printf("%d\n",B[i]);
+        }
+    }
+    printf("---\n");
+}
+
+static void
+check_data() {
+    for (int i=0;i<length;i++) {
+        if(A[i] != i+task) {
+            puts("Incorrect.");
+            return;
+        }
+    }
+    puts("Correct.");
+}
+
+void
+init(int args, char *argv[]) {
+    for (int i = 1; argv[i]; ++i) {
+        if (strcmp(argv[i], "--length") == 0 || strcmp(argv[i], "-l") == 0) {
+            length = atoi(argv[++i]);
+        } else if (strcmp(argv[i], "-t") == 0) {
+            task = atoi(argv[++i]);
+        } else if (strcmp(argv[i], "--print") ==0) {
+            print_flag = 1;
+        } else if (strcmp(argv[i], "-any") == 0) {
+            spe_cpu = ANY_ANY;
+        } else if (strcmp(argv[i], "-g") == 0) {
+            spe_cpu = GPU_0;
+        } else if (strcmp(argv[i], "--flip") ==0 || strcmp(argv[i], "-f") == 0) {
+            flip_flag = 1;
+        }
+    }
+}
+
+void
+add_init(TaskManager *manager)
+{
+    HTask* add[task];
+
+    A = new int[length];
+    B = new int[length];
+
+    for(int i=0; i<length; i++) {
+        A[i]=i;
+        B[i]=0;
+    }
+
+    for (int i=0; i<task-1; i++) {
+        add[i] = manager->create_task(ADD_TASK);
+        add[i]->set_cpu(spe_cpu);
+        add[i]->set_inData(0,(memaddr)A, sizeof(int)*length);
+        add[i]->set_inData(1,(memaddr)B, sizeof(int)*length);
+        add[i]->set_outData(0,(memaddr)B, sizeof(int)*length);
+        if (flip_flag == 1)
+            add[i]->flip();
+        if (i != 0)
+            add[i]->wait_for(add[i-1]);
+        add[i]->iterate(length); 
+    }
+
+    add[task-1] = manager->create_task(ADD_TASK);
+    add[task-1]->set_cpu(spe_cpu);
+    add[task-1]->set_inData(0,(memaddr)A, sizeof(int)*length);
+    add[task-1]->set_inData(1,(memaddr)B, sizeof(int)*length);
+    add[task-1]->set_outData(0,(memaddr)B, sizeof(int)*length);
+    if (task >= 2)
+        add[task-1]->wait_for(add[task-2]);
+    add[task-1]->iterate(length); 
+}
+
+
+int
+TMmain(TaskManager *manager,int argc, char *argv[])
+{
+    init(argc, argv);
+    // Task Register
+    task_init();
+    gpu_task_init();
+    add_init(manager);
+    st_time = getTime();
+    manager->set_TMend(TMend);
+    return 0;
+}
+
+void
+TMend(TaskManager *manager)
+{
+    ed_time = getTime();
+    print_result();
+    printf("Time: %0.6f\n",ed_time-st_time);
+    //check_data();
+
+    delete[] A;
+    delete[] B;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/ppe/Add.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include "SchedTask.h"
+#include "Multi.h"
+#include "Func.h"
+#include "GpuScheduler.h"
+
+/* これは必須 */
+SchedDefineTask(Add);
+
+static int
+run(SchedTask *s, void *rbuf, void *wbuf)
+{
+    int* A = (int*)s->get_input(rbuf, 0);
+    int* B = (int*)s->get_output(wbuf,0);
+    long i = (long)s->x;
+
+    B[i] = B[i]+A[i];
+    return 0;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/ppe/Add.h	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,16 @@
+#ifndef INCLUDED_TASK_HELLO
+#define INCLUDED_TASK_HELLO
+
+#ifndef INCLUDED_SCHED_TASK
+#include "SchedTask.h"
+#endif
+/*
+class Twice : public SchedTask {
+public:
+    SchedConstructor(Twice);
+    
+    int run(void *r, void *w);
+};
+ */
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TaskManager/test/flipTest/ppe/task_init.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,16 @@
+#include "Func.h"
+#include "Scheduler.h"
+
+/* 必ずこの位置に書いて */
+SchedExternTask(Add);
+/**
+ * この関数は ../spe/spe-main と違って
+ * 自分で呼び出せばいい関数なので
+ * 好きな関数名でおk (SchedRegisterTask は必須)
+ */
+
+void
+task_init(void)
+{
+    SchedRegisterTask(ADD_TASK, Add);
+}
--- a/example/cuda_fft/Makefile.def	Tue Jun 17 14:17:39 2014 +0900
+++ b/example/cuda_fft/Makefile.def	Wed Jul 16 01:28:20 2014 +0900
@@ -5,4 +5,4 @@
 CC = clang++
 NVCC = nvcc
 CFLAGS = -Wall $(OPT)
-NVCCFLAGS = -ptx -arch=sm_20
\ No newline at end of file
+NVCCFLAGS = -ptx -arch=sm_20 #-g -G
\ No newline at end of file
--- a/example/fft/cuda/butterfly.cu	Tue Jun 17 14:17:39 2014 +0900
+++ b/example/fft/cuda/butterfly.cu	Wed Jul 16 01:28:20 2014 +0900
@@ -1,6 +1,6 @@
 extern "C" {
     __global__ void
-    butterfly(long* param, float* x_in, float* w, float* x_out)
+    butterfly(long* param, float* x, float* w)
     {
         unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
         unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1);
@@ -22,10 +22,10 @@
         
         float xa[2], xb[2], xbxx[2], xbyy[2], wab[2], wayx[2], wbyx[2], resa[2], resb[2];
         
-        xa[0] = x_in[2*a];
-        xa[1] = x_in[2*a+1];
-        xb[0] = x_in[2*b];
-        xb[1] = x_in[2*b+1];
+        xa[0] = x[2*a];
+        xa[1] = x[2*a+1];
+        xb[0] = x[2*b];
+        xb[1] = x[2*b+1];
         xbxx[0] = xbxx[1] = xb[0];
         xbyy[0] = xbyy[1] = xb[1];
         
@@ -48,9 +48,9 @@
         resb[0] = xa[0] - xbxx[0]*wab[0] + xbyy[0]*wbyx[0];
         resb[1] = xa[1] - xbxx[1]*wab[1] + xbyy[1]*wbyx[1];
 
-        x_out[2*a] = resa[0];
-        x_out[2*a+1] = resa[1];
-        x_out[2*b] = resb[0];
-        x_out[2*b+1] = resb[1];
+        x[2*a] = resa[0];
+        x[2*a+1] = resa[1];
+        x[2*b] = resb[0];
+        x[2*b+1] = resb[1];
     }
 }
--- a/example/fft/cuda/norm.cu	Tue Jun 17 14:17:39 2014 +0900
+++ b/example/fft/cuda/norm.cu	Wed Jul 16 01:28:20 2014 +0900
@@ -1,13 +1,13 @@
 extern "C" {
     __global__ void
-    norm(long* param, float* in_x,float* out_x)
+    norm(long* param, float* x)
     {
         unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0);
         unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1);
         
         long n = param[0];
         
-        out_x[(nid*n+gid)*2] = in_x[(nid*n+gid)*2] / (float)n;
-        out_x[(nid*n+gid)*2+1] = in_x[(nid*n+gid)*2+1] / (float)n;
+        x[(nid*n+gid)*2] = x[(nid*n+gid)*2] / (float)n;
+        x[(nid*n+gid)*2+1] = x[(nid*n+gid)*2+1] / (float)n;
     }
 }
--- a/example/fft/main.cc	Tue Jun 17 14:17:39 2014 +0900
+++ b/example/fft/main.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -107,7 +107,7 @@
 }
 
 HTask*
-fftCore(TaskManager *manager,cl_float2 *dst, cl_float2 *src, cl_float2 *spin, long m, enum Mode direction,HTask* waitTask)
+fftCore(TaskManager *manager,cl_float2 *dst, cl_float2 *src, cl_float2 *spin, long m, enum Mode direction, HTask* waitTask, bool last)
 {
     long direction_flag;
     switch (direction) {
@@ -126,6 +126,7 @@
     brev->set_inData(0, src, length_src*sizeof(cl_float2));
     brev->set_outData(0, dst, length_dst*sizeof(cl_float2));
     brev->set_cpu(spe_cpu);
+    brev->flip();
     brev->wait_for(waitTask);
     brev->iterate(gws[0],gws[1]);
 
@@ -139,8 +140,9 @@
         bfly->set_param(2,(long)iter);
         bfly->set_inData(0, dst, length_dst*sizeof(cl_float2));
         bfly->set_inData(1, spin, sizeof(cl_float2)*(n/2));
-        bfly->set_outData(0,dst,length_dst*sizeof(cl_float2));
+        bfly->set_outData(0, dst,length_dst*sizeof(cl_float2));
         bfly->set_cpu(spe_cpu);
+        bfly->flip();
         bfly->wait_for(waitTask);
         bfly->iterate(gws[0],gws[1]);
         waitTask = bfly;
@@ -149,7 +151,9 @@
     if (direction == inverse) { 
         setWorkSize(gws,lws,n,n);
         HTask *norm = manager->create_task(NORMALIZATION);
-        norm->set_inData(0,dst,length_dst*sizeof(cl_float2));
+        norm->set_inData(0, dst,length_dst*sizeof(cl_float2));
+        if (!last)
+            norm->flip();
         norm->set_outData(0, dst, length_dst*sizeof(cl_float2));
         norm->set_param(0,n);
         norm->set_cpu(spe_cpu);
@@ -163,9 +167,9 @@
 
 char *
 init(int argc, char**argv){
-
+    
     char *filename = 0;
-
+    
     //    printf("%s ",argv[4]);
     for (int i = 1; argv[i]; ++i) {
         if (strcmp(argv[i], "-file") == 0) {
@@ -191,11 +195,11 @@
     long m = (cl_int)(log((double)n)/log(2.0));
     size_t *gws = new size_t[2];
     size_t *lws = new size_t[2];
-
+    
     xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
     rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2));
     wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2));
-
+    
     HTask* waitTask;
     /*
      * [cl_float2]
@@ -226,10 +230,11 @@
     sfac->set_outData(0, wm, length_w*sizeof(cl_float2));
     sfac->set_param(0,n);
     sfac->set_cpu(spe_cpu);
+    sfac->flip();
     sfac->iterate(gws[0]);
 
     // Butterfly Operation
-    waitTask = fftCore(manager, rm, xm, wm, m, forward,sfac);
+    waitTask = fftCore(manager, rm, xm, wm, m, forward, sfac, false);
 
     // Transpose matrix 
     int length_r =n*n;
@@ -239,11 +244,12 @@
     first_trns->set_outData(0,xm,length_r*sizeof(cl_float2));
     first_trns->set_param(0,n);
     first_trns->set_cpu(spe_cpu);
+    first_trns->flip();
     first_trns->wait_for(waitTask);
     first_trns->iterate(gws[0],gws[1]);
 
     // Butterfly Operation 
-    waitTask = fftCore(manager, rm, xm, wm, m, forward,first_trns);
+    waitTask = fftCore(manager, rm, xm, wm, m, forward, first_trns, false);
 
     // Apply high-pass filter
     HTask *hpfl = manager->create_task(HIGH_PASS_FILTER);
@@ -254,13 +260,14 @@
     hpfl->set_param(0,n);
     hpfl->set_param(1,(long)radius);
     hpfl->set_cpu(spe_cpu);
+    hpfl->flip();
     hpfl->wait_for(waitTask);
     hpfl->iterate(gws[0],gws[1]);
 
     // Inverse FFT
 
     // Butterfly Operation
-    waitTask = fftCore(manager,xm, rm, wm, m, inverse,hpfl);
+    waitTask = fftCore(manager,xm, rm, wm, m, inverse, hpfl, false);
 
     // Transpose matrix
     setWorkSize(gws,lws,n,n);
@@ -269,12 +276,13 @@
     second_trns->set_outData(0,rm,length_r*sizeof(cl_float2));
     second_trns->set_param(0,n);
     second_trns->set_cpu(spe_cpu);
+    second_trns->flip();
     second_trns->wait_for(waitTask);
     second_trns->iterate(gws[0],gws[1]);
 
     // Butterfly Operation
 
-    waitTask = fftCore(manager,xm, rm, wm, m, inverse,second_trns);
+    waitTask = fftCore(manager,xm, rm, wm, m, inverse, second_trns, true);
 }
 
 int TMmain(TaskManager *manager, int argc, char** argv) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/AudioData.h	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,16 @@
+#include <SDL_audio.h>
+typedef struct audioData {
+    struct audioData *self;
+    int volume;
+    double Frequency;
+    char *waveform_name;
+
+    int freq;
+    Uint16 format;
+    Uint8 channels;
+    Uint8 silence;
+    Uint16 samples;
+    Uint32 size;
+    //void (*callback)(void *userdata, Uint8 *stream, int len);
+    void *userdata;
+} AudioData, *AudioDataPtr;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/Func.h	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,9 @@
+enum {
+#include "SysTasks.h"
+    OSC_TASK,
+};
+
+#define DATA_NUM 16
+#define ADD_NUM 26
+
+#define DATA_ID 0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/Makefile	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,40 @@
+default: macosx
+
+macosx: FORCE
+	@echo "Make for Mac OS X"
+	@$(MAKE) -f Makefile.macosx
+
+linux: FORCE
+	@echo "Make for Linux"
+	@$(MAKE) -f Makefile.linux
+
+cell: FORCE
+	@echo "Make for CELL (Cell)"
+	@$(MAKE) -f Makefile.cell
+
+gpu: FORCE
+	@echo "Make for OpenCL"
+	@$(MAKE) -f Makefile.gpu
+
+cuda: FORCE
+	@echo "Make for Cuda"
+	@$(MAKE) -f Makefile.cuda
+
+test:
+	./word_count -file c.txt
+
+parallel-test: macosx
+	@$(MAKE) -f Makefile.macosx test
+
+gpu-test: FORCE
+	@echo "Make for OpenCL"
+	@$(MAKE) -f Makefile.gpu test
+
+
+FORCE:
+
+clean:
+	@$(MAKE) -f Makefile.macosx clean
+	@$(MAKE) -f Makefile.linux clean
+	@$(MAKE) -f Makefile.cell clean
+	@$(MAKE) -f Makefile.cuda clean
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/Makefile.cell	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,39 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE =  # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+TASK_DIR  = ppe
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
+TASK_SRCS_EXCLUDE = 
+TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP))
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+LIBS += -lCellManager -lspe2 -lpthread -Wl,--gc-sections 
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET) speobject
+
+$(TARGET): $(OBJS) $(TASK_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+speobject:
+	cd spe; $(MAKE)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo ppu-gdb ./$(TARGET) 
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS)
+	rm -f *~ \#*
+	rm -f ppe/*~ ppe/\#*
+	cd spe; $(MAKE) clean
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/Makefile.cuda	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,51 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE =  # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+TASK_DIR  = ppe
+CUDA_TASK_DIR = cuda
+
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
+TASK_SRCS_EXCLUDE = 
+TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP)) $(wildcard $(CUDA_TASK_DIR)/*.cc)
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+CUDA_SRCS_TMP = $(wildcard $(CUDA_TASK_DIR)/*.cu)
+CUDA_SRCS_EXCLUDE = # 除外するファイルを書く
+CUDA_SRCS = $(filter-out $(CUDA_TASK_DIR)/$(CUDA_SRCS_EXCLUDE),$(CUDA_SRCS_TMP))
+CUDA_OBJS = $(CUDA_SRCS:.cu=.ptx)
+
+CFLAGS += -D__CERIUM_CUDA__
+LIBS += `sdl-config --libs` -lCudaManager -F/Library/Frameworks -framework CUDA
+
+INCLUDE += -I$(CUDA_PATH)
+
+NVCC = nvcc
+NVCCFLAGS = -ptx -arch=sm_20
+
+.SUFFIXES: .cc .o .cu .ptx
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+.cu.ptx:
+	$(NVCC) $(NVCCFLAGS) $< -o $@
+
+all: $(TARGET) $(CUDA_OBJS)
+
+$(TARGET): $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo ppu-gdb ./$(TARGET) 
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
+	rm -f *~ \#*
+	rm -f cuda/*~ cuda/\#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/Makefile.def	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,18 @@
+TARGET = synthesizer
+
+# include/library path
+# ex  macosx
+#CERIUM = /Users/gongo/Source/Cerium
+ABIBIT=64
+
+# ex  linux/ps3
+CERIUM = ../../../Cerium
+
+
+OPT =  -g -O0
+
+CC      = clang++
+CFLAGS = -Wall `sdl-config --cflags` -m$(ABIBIT)   $(OPT) #-DDEBUG
+
+INCLUDE = -I${CERIUM}/include/TaskManager -I. -I..
+LIBS = -L${CERIUM}/TaskManager
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/Makefile.gpu	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,62 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE = # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+TASK_DIR1  = ppe
+TASK_DIR2  = gpu
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR2)/*.cc $(TASK_DIR1)/*.cc)
+TASK_SRCS_EXCLUDE = # Exec.cc
+TASK_SRCS = $(filter-out $(TASK_DIR1)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP))
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+CC += $(ABI)
+CFLAGS  += -D__CERIUM_GPU__
+
+INCLUDE = -I${CERIUM}/include/TaskManager -I. -I..
+LIBS = -L${CERIUM}/TaskManager -DUSE_SIMPLE_TASK -lGpuManager -framework opencl `sdl-config --libs`
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS) $(TASK_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo lldb -- ./$(TARGET) -file c.txt -gpu -g
+
+test :
+	./$(TARGET) -file c.txt -gpu -g
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS)
+	rm -f *~ \#*
+	rm -f ppe/*~ ppe/\#*
+	rm -f spe/*~ spe/\#*
+	rm -f gpu/*~ gpu/\#*
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/Makefile.linux	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,36 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE =  # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+TASK_DIR  = ppe
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
+TASK_SRCS_EXCLUDE = 
+TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP))
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+LIBS += -lFifoManager -lrt
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS) $(TASK_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo gdb ./$(TARGET) 
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS)
+	rm -f *~ \#*
+	rm -f ppe/*~ ppe/\#*
+	rm -f spe/*~ spe/\#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/Makefile.macosx	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,41 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE =  # 除外するファイルを書く
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+TASK_DIR  = ppe
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
+TASK_SRCS_EXCLUDE = 
+TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP))
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+LIBS += -lFifoManager `sdl-config --libs`
+CC += -m$(ABIBIT)
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS) $(TASK_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo gdb ./$(TARGET) 
+
+test:
+	./$(TARGET) -file c.txt -cpu 1
+	./$(TARGET) -file c.txt -cpu 4
+	./$(TARGET) -file c.txt -cpu 4 -i
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS)
+	rm -f *~ \#*
+	rm -f ppe/*~ ppe/\#*
+	rm -f spe/*~ spe/\#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/README	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,19 @@
+
+word count
+
+     16     16     16
+   |------|------|------|------|------|------|------|------|
+
+と言うように実行すると、
+   /\/\/\/\
+   \/\/\/\/
+patter になってしまう。
+
+
+     16     16
+   |------|------| 16
+          |------|------| 16
+                 |------|------|  16
+                        |------|------|
+
+となるようにしたい。run16 は逐次で実行されるので、二つspawnすれば良い?
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/main.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,77 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include "TaskManager.h"
+#include "SchedTask.h"
+#include "Func.h"
+#include "AudioData.h"
+
+#include <sdl.h>
+#include <SDL_audio.h>
+#include <math.h>
+
+/*
+ * PS3でCPU数が2以上の時に、あまりが計算されてない
+ */
+
+extern void task_init();
+CPU_TYPE spe_cpu = SPE_ANY;
+int volume = 3000;
+double  Frequency = 440;
+char    *waveform_name;
+
+const char *usr_help_str = "Usage: ./word_count [-a -c -s] [-cpu spe_num] [-g] [-file filename] [-br]\n";
+
+static void
+run_start(TaskManager *manager)
+{
+
+    AudioDataPtr au = (AudioDataPtr)manager->allocate(sizeof(AudioData));
+    au->self = au;
+
+    au->volume = volume;
+    au->Frequency = Frequency;
+    au->waveform_name = waveform_name;
+
+    printf("Freq:%f\n",Frequency);
+    au->freq= 44100; /* Sampling rate: 44100Hz */
+    au->format= AUDIO_S16LSB; /* 16-bit signed audio */
+    au->channels= 1; /* Mono */
+    au->samples= 8192; /* Buffer size: 8K = 0.37 sec. */
+    // au->callback= callback;
+    au->userdata= NULL;
+
+    HTaskPtr osc = manager->create_task(OSC_TASK);
+    osc->set_inData(0,(memaddr)&au->self,sizeof(memaddr));
+    osc->spawn();
+}
+
+void
+init(int argc, char **argv)
+{
+    for (int i = 1; argv[i]; ++i) {
+        if (strcmp(argv[i], "-freq") == 0) {
+            Frequency = atof(argv[i+1]);
+        } else if (strcmp(argv[i], "-wave") == 0) {
+            waveform_name = argv[i+1];
+        } else if (strcmp(argv[i], "-vol") == 0) {
+            volume = atof(argv[i+1]);
+        }
+    }
+}
+
+
+int
+TMmain(TaskManager *manager, int argc, char *argv[])
+{
+    task_init();
+    run_start(manager);
+    return 0;
+}
+/* end */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/ppe/OSC.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <string.h>
+#include "AudioData.h"
+#include "Func.h"
+#include "OSC.h"
+
+#include <sdl.h>
+#include <SDL_audio.h>
+#include <math.h>
+/* これは必須 */
+SchedDefineTask1(Audioosc,osc);
+
+int gvolume = 3000;
+double  gFrequency = 440;
+char    *gwaveform_name;
+SDL_AudioSpec Desired;
+SDL_AudioSpec Obtained;
+
+double square(double t){
+    double decimal_part = t - abs(t);
+    return decimal_part < 0.5 ? 1 : -1;
+}
+
+double tri(double t){
+
+    double decimal_part = t - abs(t);
+
+    if(abs(t) % 2 != 0){
+        return decimal_part < 0.5 ? decimal_part : 1 - decimal_part;
+    }else{
+        return decimal_part < 0.5 ? -decimal_part : 1 - decimal_part;
+    }
+}
+
+
+void callback(void *unused,Uint8 *stream,int len){
+    static unsigned int step = 0;
+    Uint16 *frames = (Uint16 *) stream;
+    int framesize = len / 2;
+
+    if(strcmp(gwaveform_name, "tri")){
+
+        for (int i = 0; i < framesize ; i++, step++){
+            frames[i] = tri(step * gFrequency / Obtained.freq) * gvolume ;
+        }
+
+    }else if(strcmp(gwaveform_name, "sqr")){
+
+        for (int i = 0; i < framesize ; i++, step++){
+            frames[i] = square(step * gFrequency / Obtained.freq) * gvolume ;
+        }
+
+    }
+}
+
+static int
+osc(SchedTask *s, void *rbuf, void *wbuf)
+{
+    AudioData *i_data = (AudioDataPtr)s->get_input(0);
+
+    gvolume = i_data->volume;
+    gwaveform_name = i_data->waveform_name;
+    gFrequency = i_data->Frequency;
+
+    Desired.freq= i_data->freq; /* Sampling rate: 44100Hz */
+    Desired.format= i_data->format; /* 16-bit signed audio */
+    Desired.channels= i_data->channels; /* Mono */
+    Desired.samples= i_data->samples; /* Buffer size: 8K = 0.37 sec. */
+    Desired.callback= callback;
+    Desired.userdata= i_data->userdata;
+
+    SDL_OpenAudio(&Desired, &Obtained);
+    SDL_PauseAudio(0);
+    SDL_Delay(200);
+    SDL_Quit();
+
+    return 0;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/ppe/OSC.h	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,9 @@
+#ifndef INCLUDED_TASK_HELLO
+#define INCLUDED_TASK_HELLO
+
+#ifndef INCLUDED_SCHED_TASK
+#  include "SchedTask.h"
+#endif
+
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/spe/Exec.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,45 @@
+#include <stdio.h>
+#include <string.h>
+#include "Exec.h"
+#include "Func.h"
+
+/* これは必須 */
+SchedDefineTask1(Exec,wordcount);
+
+static int
+wordcount(SchedTask *s, void *rbuf, void *wbuf)
+{
+    char *i_data = (char *)rbuf;
+    unsigned long long *o_data = (unsigned long long*)wbuf;
+    unsigned long long *head_tail_flag = o_data +2;
+    int length = (int)s->get_inputSize(0);
+    int word_flag = 0;
+    int word_num = 0;
+    int line_num = 0;
+    int i = 0;
+
+    head_tail_flag[0] = (i_data[0] != 0x20) && (i_data[0] != 0x0A);
+    word_num -= 1-head_tail_flag[0];
+
+    for (; i < length; i++) {
+	if (i_data[i] == 0x20) {
+	    word_flag = 1;
+	} else if (i_data[i] == 0x0A) {
+	    line_num += 1;
+	    word_flag = 1;
+	} else {
+	    word_num += word_flag;
+	    word_flag = 0;
+	}
+    }
+
+    word_num += word_flag;
+    head_tail_flag[1] = (i_data[i-1] != 0x20) && (i_data[i-1] != 0x0A);
+
+    // s->printf("SPE word %d line %d\n",word_num,line_num);
+
+    o_data[0] = (unsigned long long)word_num;
+    o_data[1] = (unsigned long long)line_num;
+
+    return 0;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/spe/Exec.h	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,9 @@
+#ifndef INCLUDED_TASK_HELLO
+#define INCLUDED_TASK_HELLO
+
+#ifndef INCLUDED_SCHED_TASK
+#  include "SchedTask.h"
+#endif
+
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/spe/Makefile	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,26 @@
+include ../Makefile.def
+
+TARGET = ../spe-main
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+CC      = spu-g++
+CFLAGS  = -Wall -fno-exceptions -fno-rtti $(OPT) 
+INCLUDE = -I../${CERIUM}/include/TaskManager -I. -I..
+LIBS = -L../${CERIUM}/TaskManager -lspemanager
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+clean:
+	rm -f $(TARGET) $(OBJS)
+	rm -f *~ \#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/synthesizer/task_init.cc	Wed Jul 16 01:28:20 2014 +0900
@@ -0,0 +1,22 @@
+#include "Func.h"
+#include "Scheduler.h"
+#ifdef __CERIUM_GPU__
+#include "GpuScheduler.h"
+#endif
+#ifdef __CERIUM_CUDA__
+#include "CudaScheduler.h"
+#endif
+
+/* 必ずこの位置に書いて */
+SchedExternTask(Audioosc);
+
+/**
+ * この関数は ../spe/spe-main と違って
+ * 自分で呼び出せばいい関数なので
+ * 好きな関数名でおk (SchedRegisterTask は必須)
+ */
+void
+task_init(void)
+{
+    SchedRegisterTask(OSC_TASK, Audioosc);
+}