changeset 1524:32305a19a380 draft cell

merge branch
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Wed, 14 Nov 2012 09:45:28 +0900
parents 30145272ff0b (current diff) d232231e1425 (diff)
children 23f8034d8100 67a2da98d95c
files TaskManager/kernel/ppe/CpuThreads.cc example/many_task/ppe/QuickSort.cc.loop example/many_task/ppe/task_init.cc
diffstat 44 files changed, 814 insertions(+), 313 deletions(-)
--- a/TaskManager/Fifo/PreRefDmaManager.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/Fifo/PreRefDmaManager.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -3,7 +3,7 @@
 #include "TaskManagerImpl.h"
 
 void *
-PreRefDmaManager::dma_load(Scheduler *s, void *buf, memaddr addr, uint32 size, uint32 mask)
+PreRefDmaManager::dma_load(Scheduler *s, memaddr addr, uint32 size, uint32 mask)
 {
 	unsigned long long wait = 0;
 	(this->*start_dmawait_profile)();
--- a/TaskManager/Fifo/PreRefDmaManager.h	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/Fifo/PreRefDmaManager.h	Wed Nov 14 09:45:28 2012 +0900
@@ -7,7 +7,7 @@
 
 public:
 	/* functions */
-    virtual void *dma_load(Scheduler *s, void *buf, memaddr addr, uint32 size, uint32 mask);
+    virtual void *dma_load(Scheduler *s, memaddr addr, uint32 size, uint32 mask);
     virtual void free_(void *buf);
 } ;
 
--- a/TaskManager/Fifo/gettime.h	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/Fifo/gettime.h	Wed Nov 14 09:45:28 2012 +0900
@@ -10,8 +10,8 @@
  */
 inline unsigned long long gettime() {
 
-	unsigned long long time;
-#ifdef __CERIUM_FIFO__
+	unsigned long long time = 0;
+#ifdef __CERIUM_FIFO__  // ??
 	struct timespec ts;
 
 #ifndef __APPLE__
--- a/TaskManager/Fifo/rdtsc.h	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/Fifo/rdtsc.h	Wed Nov 14 09:45:28 2012 +0900
@@ -6,8 +6,8 @@
  */
 
 inline unsigned long long rdtsc() {
-	unsigned long long ret;
-#ifdef __CERIUM_FIFO__
+	unsigned long long ret = 0;
+#ifdef __CERIUM_FIFO__ // ??
 	__asm__ volatile ("rdtsc" : "=A" (ret));
 #endif // __CERIUM_FIFO__
 	return ret;
--- a/TaskManager/Gpu/GpuScheduler.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/Gpu/GpuScheduler.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -3,6 +3,8 @@
 #include "PreRefDmaManager.h"
 #include "SchedTask.h"
 #include "stdio.h"
+//
+#include "ListData.h"
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <string.h>
@@ -82,15 +84,14 @@
                 load_kernel(nextTask->command);
 
                 cl_kernel& kernel = *task_list[nextTask->command].gputask->kernel;
-                int err = CL_SUCCESS;
 
                 int param = 0;
 
                 cl_mem memparam = clCreateBuffer(context, CL_MEM_READ_ONLY,
-                                               sizeof(memaddr)*nextTask->param_count, NULL, NULL);
-                err |= clEnqueueWriteBuffer(command_queue[cur], memparam, CL_TRUE, 0, sizeof(memaddr)*nextTask->param_count,
-                                            nextTask->param(0), 0, NULL, NULL);
-                err |= clSetKernelArg(kernel, param, sizeof(memaddr),(void *)&memparam);
+                                                 sizeof(memaddr)*nextTask->param_count, NULL, NULL);
+                ret = clEnqueueWriteBuffer(command_queue[cur], memparam, CL_TRUE, 0,
+                                           sizeof(memaddr)*nextTask->param_count,nextTask->param(0), 0, NULL, NULL);
+                ret = clSetKernelArg(kernel, param, sizeof(memaddr),(void *)&memparam);
 
                 param++;
 
@@ -99,10 +100,10 @@
                 if (!flag.flip) {
                     for(int i=0;i<nextTask->inData_count;i++) {
                         memin[i] = clCreateBuffer(context, mem_flag, nextTask->inData(i)->size, NULL, NULL);
-
-                        err |= clEnqueueWriteBuffer(command_queue[cur], memin[i], CL_TRUE, 0,
-                                                    nextTask->inData(i)->size, nextTask->inData(i)->addr, 0, NULL, NULL);
-                        err |= clSetKernelArg(kernel,  param, sizeof(memaddr), (void *)&memin[i]);
+                        ListElement *input_buf = nextTask->inData(i);
+                        ret = clEnqueueWriteBuffer(command_queue[cur], memin[i], CL_TRUE, 0,
+                                                   input_buf->size, input_buf->addr, 0, NULL, NULL);
+                        ret = clSetKernelArg(kernel,  param, sizeof(memaddr), (void *)&memin[i]);
 
                         param++;
                     }
@@ -111,39 +112,52 @@
                 cl_mem *memout = new cl_mem[nextTask->outData_count];
                 cl_mem_flags out_mem_flag = flag.flip? CL_MEM_READ_WRITE : CL_MEM_WRITE_ONLY;
 
+
                 for(int i=0;i<nextTask->outData_count;i++) {
-                    memout[i] = clCreateBuffer(context, out_mem_flag, nextTask->outData(i)->size, NULL, NULL);
+                    ListElement *output_buf = nextTask->outData(i);
+                    memout[i] = clCreateBuffer(context, out_mem_flag, output_buf->size, NULL, &ret);
+
                     if (flag.flip) { // use output buffer as input buffer
-                        err |= clEnqueueWriteBuffer(command_queue[cur], memout[i], CL_TRUE, 0,
-                                                    nextTask->inData(i)->size, nextTask->inData(i)->addr, 0, NULL, NULL);                    }
-                    err |= clSetKernelArg(kernel,  param, sizeof(memaddr), (void *)&memout[i]);
+                        ListElement *input_buf = nextTask->inData(i);
+
+                        ret = clEnqueueWriteBuffer(command_queue[cur], memout[i], CL_TRUE, 0,
+                                                   input_buf->size, input_buf->addr, 0, NULL, NULL);
+                    }
+                    ret = clSetKernelArg(kernel,  param, sizeof(memaddr), (void *)&memout[i]);
                     param++;
                 }
 
+
                 cl_event ev = NULL;
-                clEnqueueTask(command_queue[cur], kernel, 0, NULL, &ev);
-
+                ret = clEnqueueTask(command_queue[cur], kernel, 0, NULL, &ev);
                 // if the ndrange flag is 0, dim, global_work_size[0] and local_work_size[0] are fixed at 1
                 // clEnqueueNDRange
                 // (command_queue[cur], kernel, dim, NULL,global_work_size[0],local_work_size[0],NULL&ev);
                 for(int i=0;i<nextTask->outData_count;i++) {
-                    err |= clEnqueueReadBuffer(command_queue[cur], memout[i], CL_TRUE, 0,
-                                               nextTask->outData(i)->size, nextTask->outData(i)->addr, 1, &ev, NULL);
+                    ListElement *output_buf = nextTask->outData(i);
+                    ret = clEnqueueReadBuffer(command_queue[cur], memout[i], CL_TRUE, 0,
+                                              output_buf->size, output_buf->addr, 1, &ev, NULL);
+
                 }
             }
 
             reply[cur] = (memaddr)tasklist->waiter;
-            clFlush(command_queue[1-cur]); // waiting for queued task
-            if(reply[1-cur]) {
-                connector->mail_write(reply[1-cur]);
+            clFlush(command_queue[cur]); // waiting for queued task
+            //clFinish(command_queue[cur]); // waiting for queued task
+            // pipeline    : 1-cur
+            // no pipeline : cur
+
+            if(reply[cur]) {
+                connector->mail_write(reply[cur]);
             }
 
             params_addr = (memaddr)tasklist->next;
             cur = 1 - cur;
         }
-
-        clFlush(command_queue[1-cur]); // waiting for queued task
-        connector->mail_write(reply[1-cur]);
+        /*
+          clFlush(command_queue[1-cur]); // waiting for queued task
+          connector->mail_write(reply[1-cur]);
+        */
 
         connector->mail_write((memaddr)MY_SPE_STATUS_READY);
     }
@@ -165,29 +179,29 @@
     const char *filename = (const char *)task_list[cmd].gputask->kernel;
     const char *functionname = task_list[cmd].name;
 
-    int fp;
+    int fd;
     char *source_str;
     size_t source_size;
 
-    fp = open(filename, O_RDONLY);
+    fd = open(filename, O_RDONLY);
 
-    if (!fp) {
-        fprintf(stderr, "Failed to load kernel.\n");
+    if (fd<0) {
+        fprintf(stderr, "Failed to load kernel %s.\n",filename);
         exit(1);
     }
 
     struct stat stats;
-    fstat(fp,&stats);
+    fstat(fd,&stats);
     off_t size = stats.st_size;
 
-    if (!size) {
+    if (size<=0) {
         fprintf(stderr, "Failed to load kernel.\n");
         exit(1);
     }
 
     source_str = (char*)alloca(size);
-    source_size = read(fp, source_str, size);
-    close(fp);
+    source_size = read(fd, source_str, size);
+    close(fd);
 
     cl_program program =
         clCreateProgramWithSource(context, 1,
@@ -205,7 +219,6 @@
 void
 gpu_register_task(int cmd, const char* filename, const char* functionname)
 {
-    task_list[cmd].gputask->a = 1;
     task_list[cmd].run = not_ready;  // not yet ready
     task_list[cmd].load = null_loader;
     task_list[cmd].wait = null_loader;
@@ -213,4 +226,12 @@
     task_list[cmd].gputask->kernel = (cl_kernel *) filename;
 }
 
+void
+gpu_register_ndrange(int cmd, int dim, size_t* l_work_size)
+{
+    task_list[cmd].gputask->dim = dim;
+    task_list[cmd].gputask->l_work_size = l_work_size;
+
+}
+
 /* end */
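
Note: gpu_register_ndrange() only records dim and l_work_size in the GpuTaskObject for now; the actual NDRange dispatch is still the commented-out clEnqueueNDRange call above. A minimal sketch of how those fields could drive a real launch, assuming the global work size is simply taken equal to the stored local work size and reusing this changeset's names (task_list, kernel, command_queue[cur], ret, ev):

    // hypothetical dispatch sketch, not part of this changeset
    GpuTaskObject *gt = task_list[nextTask->command].gputask;
    if (gt->dim > 0 && gt->l_work_size != NULL) {
        ret = clEnqueueNDRangeKernel(command_queue[cur], kernel, gt->dim,
                                     NULL,             // global_work_offset
                                     gt->l_work_size,  // global_work_size (assumed == local here)
                                     gt->l_work_size,  // local_work_size
                                     0, NULL, &ev);
    } else {
        ret = clEnqueueTask(command_queue[cur], kernel, 0, NULL, &ev);
    }
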
--- a/TaskManager/Gpu/GpuScheduler.h	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/Gpu/GpuScheduler.h	Wed Nov 14 09:45:28 2012 +0900
@@ -45,9 +45,14 @@
 
 };
 
-extern void gpu_register_task(int cmd,const char* filename,const char* functionname);
+#define GpuSchedRegister(str, filename, functionname)   \
+    gpu_register_task(str, filename, functionname);
+
+#define GpuNDRangeRegister(str, dimension, g_worksizePtr)   \
+    gpu_register_ndrange(str, dimension, g_worksizePtr);
 
 #endif
 
-#define GpuSchedRegister(str, filename, functionname)   \
-    gpu_register_task(str, filename, functionname);
+extern void gpu_register_task(int cmd,const char* filename,const char* functionname);
+extern void gpu_register_ndrange(int, int, size_t*);
+
--- a/TaskManager/Gpu/GpuThreads.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/Gpu/GpuThreads.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -30,11 +30,11 @@
 void
 GpuThreads::init()
 {
-    args->scheduler = new GpuScheduler();    
+    args->scheduler = new GpuScheduler();
     args->useRefDma = use_refdma;
 
     pthread_create(&threads[0], NULL, &gpu_thread_run, args);
-    
+
 }
 
 void *
@@ -42,10 +42,10 @@
 {
     gpu_thread_arg_t *argt = (gpu_thread_arg_t *) args;
     Scheduler *g_scheduler = argt->scheduler;
-    
+
     TaskManagerImpl *manager = new SpeTaskManagerImpl();
     g_scheduler->init(manager, argt->useRefDma);
-    
+
     manager->set_scheduler(g_scheduler);
 
     argt->wait->sem_v();
--- a/TaskManager/Makefile.def	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/Makefile.def	Wed Nov 14 09:45:28 2012 +0900
@@ -33,13 +33,13 @@
 
 ABIBIT = 64
 
-OPT = -g -DMAIL_QUEUE -DNOT_CHECK -DTASK_LIST_MAIL #-DEARLY_TOUCH -DUSE_CACHE
-#OPT = -O9 -DMAIL_QUEUE -DNOT_CHECK #-DTASK_LIST_MAIL #-DEARLY_TOUCH -DUSE_CACHE 
+OPT = -g -O0 -DMAIL_QUEUE -DNOT_CHECK -DTASK_LIST_MAIL #-DEARLY_TOUCH -DUSE_CACHE
+#OPT = -O9 -DMAIL_QUEUE -DNOT_CHECK #-DTASK_LIST_MAIL #-DEARLY_TOUCH -DUSE_CACHE
 
 
 
 
-CC     = g++   
+CC     = clang++
 CFLAGS = -Wall `sdl-config --cflags` -m$(ABIBIT)   $(OPT)
 LIBS   =  -m$(ABIBIT)
 
--- a/TaskManager/kernel/ppe/CpuThreads.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/kernel/ppe/CpuThreads.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -9,7 +9,7 @@
 #include "SchedNop.h"
 #include "SpeTaskManagerImpl.h"
 #include "CellScheduler.h"
-
+#include <fcntl.h>
 
 SchedExternTask(ShowTime);
 SchedExternTask(StartProfile);
@@ -39,7 +39,7 @@
     }
 
     for (int i = 0; i < cpu_num; i++) {
-    	delete args[i].scheduler;
+        delete args[i].scheduler;
     }
 
     delete [] threads;
@@ -64,7 +64,7 @@
     SchedRegister(ShowTime);
     SchedRegister(StartProfile);
 
-    argt->wait->sem_v();	// increment by 1 each time a thread finishes getting ready
+    argt->wait->sem_v();        // increment by 1 each time a thread finishes getting ready
 
     c_scheduler->run(new SchedNop());
     c_scheduler->finish();
@@ -90,12 +90,12 @@
     }
 
     for (int i = 0; i < cpu_num; i++) {
-	pthread_create(&threads[i], NULL,
-		      &cpu_thread_run, (void*)&args[i]);
+        pthread_create(&threads[i], NULL,
+                      &cpu_thread_run, (void*)&args[i]);
     }
 
     for (int i = 0; i < cpu_num; i++) {
-    	wait->sem_p();
+        wait->sem_p();
     }
 }
 
@@ -109,7 +109,7 @@
  */
 int
 CpuThreads::get_mail(int cpuid, int count, memaddr *ret)
-{  
+{
 #ifdef __CERIUM_GPU__
     if (is_gpu(cpuid)) return gpu->get_mail(cpuid, count, ret);
 #endif
@@ -128,7 +128,7 @@
     } else {
         return 0; // return 0 when there is no mail
     }
-    
+
 }
 /**
  * Inbound Mailbox
--- a/TaskManager/kernel/schedule/SchedTask.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/kernel/schedule/SchedTask.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -134,7 +134,7 @@
     if (outListData.bound != dout) free(outListData.bound);
 #ifdef TASK_LIST_MAIL
     if ((cur_index->next() >= list->last()) )
-	connector->mail_write(waiter);
+        connector->mail_write(waiter);
 #else
     connector->mail_write(waiter);
 #endif
@@ -147,28 +147,28 @@
 
     if (cur_index == 0) { // the first one
         SchedTask *nextSched = new SchedTask();
-	nextSched->init(list, &list->tasks[0], scheduler, this->tag^1);
-	return nextSched;
+        nextSched->init(list, &list->tasks[0], scheduler, this->tag^1);
+        return nextSched;
     }
     TaskPtr nextTask = cur_index->next();
     if (nextTask < list->last()) {
-	// the Task List still remains, so prepare the next one
+        // the Task List still remains, so prepare the next one
+
+        TaskPtr nextTask = cur_index->next();
 
-	TaskPtr nextTask = cur_index->next();
-	
         SchedTask *nextSched = new SchedTask();
-	nextSched->init(list, nextTask, scheduler, this->tag^1);
-	return nextSched;
+        nextSched->init(list, nextTask, scheduler, this->tag^1);
+        return nextSched;
     } else {
         memaddr nextList = (memaddr)list->next;
         if (nextList == 0) {
-	    // nothing more needs to be done
-	    
+            // nothing more needs to be done
+
             return new SchedNop2Ready(scheduler);
         } else {
-	    // start working on a new list
-  	    int dma_tag_switch = 0;
-	    return new SchedTaskList(nextList, scheduler, dma_tag_switch);
+            // start working on a new list
+            int dma_tag_switch = 0;
+            return new SchedTaskList(nextList, scheduler, dma_tag_switch);
         }
     }
 }
@@ -388,7 +388,7 @@
     return manager->create_task(cmd, __builtin_return_address(0));
 }
 
-HTaskPtr 
+HTaskPtr
 SchedTask::create_task(int cmd, memaddr r, long rs, memaddr w, long ws)
 {
     return manager->create_task(cmd,r,rs,w,ws, __builtin_return_address(0));
@@ -422,12 +422,12 @@
     manager->set_task_cpu(t, cpu);
 }
 
-void* SchedTask::allocate(int size) 
+void* SchedTask::allocate(int size)
 {
     return manager->allocate(size) ;
 }
 
-void* SchedTask::allocate(int size,int align) 
+void* SchedTask::allocate(int size,int align)
 {
     return manager->allocate(size,align) ;
 }
@@ -437,14 +437,14 @@
     manager->polling();
 }
 
-Scheduler* SchedTask::get_scheduler() 
+Scheduler* SchedTask::get_scheduler()
 {
     return scheduler;
 }
 
 /* system call */
 
-int 
+int
 SchedTask::printf(const char * format, ...)
 {
     va_list ap;
--- a/TaskManager/kernel/schedule/Scheduler.h	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/kernel/schedule/Scheduler.h	Wed Nov 14 09:45:28 2012 +0900
@@ -37,6 +37,8 @@
 typedef struct gpu_task_object {
 #ifdef __CERIUM_GPU__
     cl_kernel *kernel;
+    int dim;
+    size_t *l_work_size;
 #endif
 } GpuTaskObject;
 
--- a/TaskManager/test/GpuRunTest/GpuFunc.h	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/test/GpuRunTest/GpuFunc.h	Wed Nov 14 09:45:28 2012 +0900
@@ -1,6 +1,7 @@
 
 enum {
 #include "SysTasks.h"
+    mogyo,
     Twice,
     //    Func1,
 };
--- a/TaskManager/test/GpuRunTest/Makefile	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/test/GpuRunTest/Makefile	Wed Nov 14 09:45:28 2012 +0900
@@ -5,7 +5,7 @@
 SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
 OBJS = $(SRCS:.cc=.o)
 
-LIBS += -lGpuManager  -framework opencl `sdl-config --libs`
+LIBS += -lGpuManager -framework opencl `sdl-config --libs`
 
 .SUFFIXES: .cc .o
 
--- a/TaskManager/test/GpuRunTest/Makefile.def	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/test/GpuRunTest/Makefile.def	Wed Nov 14 09:45:28 2012 +0900
@@ -2,8 +2,8 @@
 
 CERIUM = ../../../../Cerium
 
-CC      = g++
-CFLAGS  =   -g -Wall  
+CC      = clang++
+CFLAGS  =   -g -Wall -O0
 
 INCLUDE = -I${CERIUM}/include/TaskManager -I. -I../..
 LIBS = -L${CERIUM}/TaskManager
--- a/TaskManager/test/GpuRunTest/task_init.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/TaskManager/test/GpuRunTest/task_init.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -5,5 +5,8 @@
 task_init(void)
 {
     int cmd = Twice;
+    int dim = 2;
+    size_t *l_work_size = new size_t(dim);
+    GpuNDRangeRegister(cmd, dim, l_work_size);
     GpuSchedRegister(cmd, "twice.cl", "twice");
 }
--- a/example/OpenCL/twice.cl	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/OpenCL/twice.cl	Wed Nov 14 09:45:28 2012 +0900
@@ -3,18 +3,8 @@
       __global int *input_data,
       __global int *output_data)
 {
-    int a = 1;
-    int b = rec(a);
     long count = (long)data_count[0];
     for (int i = 0; i<count; i++) {
-        output_data[i] = b;
+        output_data[i] = input_data[i]*2;
     }
 }
-
-int
-rec(int a)
-{
-    if (a<=1) return a;
-    return rec(a-1)+rec(a-2);
-
-}
--- a/example/basic/Makefile.gpu	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/basic/Makefile.gpu	Wed Nov 14 09:45:28 2012 +0900
@@ -5,7 +5,7 @@
 SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
 OBJS = $(SRCS:.cc=.o)
 
-TASK_DIR  = ppe
+TASK_DIR  = gpu
 TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
 TASK_SRCS_EXCLUDE = 
 TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP))
--- a/example/basic/main.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/basic/main.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -19,7 +19,7 @@
 {
     printf("%s ---\n", title);
     for (int i = 0; i < size; i++) {
-	printf("%2d ", data[i]);
+        printf("%2d ", data[i]);
     }
     printf("\n");
 }
@@ -57,7 +57,7 @@
     int *data = (int*)manager->allocate(sizeof(int)*length);
 
     for (int i = 0; i < length; i++) {
-	data[i] = i;
+        data[i] = i;
     }
 
     print_data(data, length, "before");
@@ -65,7 +65,7 @@
     /**
      * Create Task
      *   create_task(Task ID);
-     */ 
+     */
     twice = manager->create_task(TWICE_TASK);
     twice->set_cpu(SPE_ANY);
 
@@ -73,6 +73,7 @@
      * Set of Input Data
      *   add_inData(address of input data, size of input data);
      */
+    // twice->set_param(0,(memaddr)&length);
     twice->set_inData(0,data, sizeof(int)*length);
 
     /**
@@ -93,14 +94,14 @@
     twice->set_post(twice_result, (void*)data, 0);
 
     // add Active Queue
-    twice->spawn();    
+    twice->spawn();
 }
 
 int
 TMmain(TaskManager *manager,int argc, char *argv[])
 {
     if (init(argc, argv) < 0) {
-	return -1;
+        return -1;
     }
 
     // Task Register
@@ -108,7 +109,7 @@
     task_init();
 
     for (int i = 0; i < task; ++i) {
-	twice_init(manager);
+        twice_init(manager);
     }
 
     return 0;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/flip/GpuFunc.h	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,6 @@
+
+enum {
+#include "SysTasks.h"
+    Twice,
+    //    Func1,
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/flip/Makefile	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,29 @@
+include ./Makefile.def
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE =   # list files to exclude here
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+LIBS += -lGpuManager -framework opencl `sdl-config --libs`
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET)
+gpu: all
+
+$(TARGET): $(OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo gdb ./$(TARGET) 
+
+clean:
+	rm -f $(TARGET) $(OBJS)
+	rm -f *~ \#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/flip/Makefile.def	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,9 @@
+TARGET = fliptest
+
+CERIUM = ../../../Cerium
+
+CC      = g++
+CFLAGS  =   -g -Wall
+
+INCLUDE = -I${CERIUM}/include/TaskManager -I. -I../..
+LIBS = -L${CERIUM}/TaskManager
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/flip/main.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,114 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include "types.h"
+#include "TaskManager.h"
+#include "GpuFunc.h"
+
+#define DEFAULT 5
+static long int length = DEFAULT;
+static int task = 1;
+int *indata;
+
+extern void task_init(void);
+
+char usr_help_str[] = "GpuRun [length]\n";
+
+void
+print_data(int *data, int size, const char *title)
+{
+    printf("%s ---\n", title);
+    for ( int i = 0; i < size; i++) {
+        printf("%2d ", data[i]);
+    }
+    printf("\n");
+}
+
+/**
+ * check data1 and data2 after the task finishes
+ */
+void
+twice_result(SchedTask *s, void *a, void *b)
+{
+    int* data = (int*)a;
+    long length = (long)b;
+    print_data(data, length, "after");
+}
+
+
+int
+init(int argc, char **argv)
+{
+    for (int i = 1; argv[i]; ++i) {
+        if (strcmp(argv[i], "-length") == 0) {
+            length = atoi(argv[++i]);
+        } else if (strcmp(argv[i], "-count") == 0) {
+            task = atoi(argv[++i]);
+        }
+    }
+
+    return 0;
+}
+
+
+void
+tester(int *indata, int *outdata, int num) {
+
+    // check the results
+    int check = 0;
+    for (int c=0; c<num; c++) {
+        if(outdata[c] == indata[c]*2) {
+            check++;
+        }
+    }
+
+    printf("Computed '%d/%d' correct values\n",check,num);
+
+}
+
+
+void
+test(TaskManager *manager) {
+    indata = new int[length];
+
+    for (int c=0; c < length ;c++) {
+        indata[c] = c;
+    }
+
+    print_data(indata, length, "before");
+
+    HTaskPtr twice = manager->create_task(Twice);
+
+    twice->set_param(0, (memaddr)length);
+    twice->set_inData(0, indata, sizeof (int)*length);
+    twice->set_outData(0, indata, sizeof (int)*length);
+    twice->set_cpu(GPU_0);
+    twice->flip();
+
+    /*
+     * we would like to be able to pass a ppe task via set_post()
+     */
+    twice->set_post(twice_result, (void*)indata, (void*)length);
+
+    twice->spawn();
+}
+
+int
+TMmain(TaskManager *manager, int argc, char* argv[])
+{
+    if (init(argc, argv) < 0) {
+        return -1;
+    }
+
+    task_init();
+
+    for (int i = 0; i < task; ++i) {
+        test(manager);
+    }
+
+    return 0;
+}
+
+/* end */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/flip/task_init.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,12 @@
+#include "GpuFunc.h"
+#include "GpuScheduler.h"
+
+void
+task_init(void)
+{
+    int cmd = Twice;
+    int dim = 2;
+    size_t *l_work_size = new size_t(dim);
+    GpuNDRangeRegister(cmd, dim, l_work_size);
+    GpuSchedRegister(cmd, "twice.cl", "twice");
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/flip/twice.cl	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,12 @@
+__kernel void
+twice(__constant int *data_count,
+      __global int *input_data)
+      //__global int *output_data)
+{
+    long count = (long)data_count[0];
+        for (int i = 0; i<count; i++) {
+            // output_data[i] = 2*input_data[i];
+            input_data[i] *= 2;
+        }
+
+}
--- a/example/many_task/Func.h	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/Func.h	Wed Nov 14 09:45:28 2012 +0900
@@ -1,6 +1,7 @@
 enum {
 #include "SysTasks.h"
     QUICK_SORT,
+    QUICK_SORT_LOOP,
     SortSimple,
     SortCompat,
 };
--- a/example/many_task/Makefile	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/Makefile	Wed Nov 14 09:45:28 2012 +0900
@@ -16,6 +16,10 @@
 	@echo "Make for PS3 (Cell)"
 	@$(MAKE) -f Makefile.cell
 
+gpu: FORCE
+	@echo "Make for OpenCL"
+	@$(MAKE) -f Makefile.gpu
+
 FORCE:
 
 clean:
--- a/example/many_task/Makefile.def	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/Makefile.def	Wed Nov 14 09:45:28 2012 +0900
@@ -6,11 +6,11 @@
 
 CERIUM = ../../../Cerium
 
-OPT = -O9
-# OPT = -g -O9
+OPT = -O
+OPT = -g -O0
 # OPT = -g
-CC      = g++
-CFLAGS  =  -DUSE_SIMPLE_TASK -Wall  $(OPT)
+CC      = clang++
+CFLAGS  =  -Wall  $(OPT) -DUSE_SIMPLE_TASK
 # CFLAGS  =   -Wall  $(OPT)
 
 INCLUDE = -I${CERIUM}/include/TaskManager -I. -I..
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/many_task/Makefile.gpu	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,42 @@
+include ./Makefile.def
+
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE = sort_test.cc task_init.cc # list files to exclude here
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+TASK_DIR  = gpu
+TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc)
+TASK_SRCS_EXCLUDE = sort_test.cc
+TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP))
+TASK_OBJS = $(TASK_SRCS:.cc=.o)
+
+CC += $(ABI)
+# CFLAGS  = -g -Wall# -O9 #-DDEBUG
+
+INCLUDE = -I${CERIUM}/include/TaskManager -I. -I..
+LIBS = -L${CERIUM}/TaskManager -DUSE_SIMPLE_TASK -lGpuManager -framework opencl `sdl-config --libs`
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET)
+
+$(TARGET): $(OBJS) $(TASK_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo gdb ./$(TARGET)
+
+clean:
+	rm -f $(TARGET) $(OBJS) $(TASK_OBJS)
+	rm -f *~ \#*
+	rm -f ppe/*~ ppe/\#*
+	rm -f spe/*~ spe/\#*
+	rm -f gpu/*~ gpu/\#*
--- a/example/many_task/Makefile.macosx	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/Makefile.macosx	Wed Nov 14 09:45:28 2012 +0900
@@ -2,7 +2,7 @@
 
 
 SRCS_TMP = $(wildcard *.cc)
-SRCS_EXCLUDE =  # list files to exclude here
+SRCS_EXCLUDE = sort_test.cc # list files to exclude here
 SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
 OBJS = $(SRCS:.cc=.o)
 
@@ -33,7 +33,7 @@
 	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
 
 debug: $(TARGET)
-	sudo gdb ./$(TARGET) 
+	sudo gdb ./$(TARGET)
 
 clean:
 	rm -f $(TARGET) $(OBJS) $(TASK_OBJS)
--- a/example/many_task/README	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/README	Wed Nov 14 09:45:28 2012 +0900
@@ -1,3 +1,6 @@
+2012/10/8 tomari
+running make inside ppe runs a test routine that does not use Cerium
+
 2010/7/31 kono
 
 with the method that waits for one stage of the bitonic sort to finish before generating the next task,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/many_task/gpu/Makefile	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,37 @@
+TARGET = sort_test
+CERIUM = ../../../../Cerium
+
+CC      = g++
+CFLAGS  =   -g -Wall  
+
+INCLUDE = -I${CERIUM}/include/TaskManager -I. -I../
+LIBS = -L${CERIUM}/TaskManager
+
+
+SRCS_TMP = $(wildcard *.cc)
+SRCS_EXCLUDE = gpu_task_init.cc # list files to exclude here
+SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
+OBJS = $(SRCS:.cc=.o)
+
+LIBS += -framework opencl
+
+.SUFFIXES: .cc .o
+
+.cc.o:
+	$(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@
+
+all: $(TARGET)
+gpu: all
+
+$(TARGET): $(OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS)
+
+link:
+	$(CC) -o $(TARGET) $(OBJS) $(LIBS)
+
+debug: $(TARGET)
+	sudo gdb ./$(TARGET) 
+
+clean:
+	rm -f $(TARGET) $(OBJS)
+	rm -f *~ \#*
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/many_task/gpu/QuickSort.cl	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,50 @@
+typedef struct Data {
+    int index;
+    int ptr;
+    int pad[2];
+} Data, *DataPtr;
+
+inline void
+swap(__global Data *data, int left, int right )
+{
+    Data tmp    = data[left];
+    data[left]  = data[right];
+    data[right] = tmp;
+}
+
+__kernel void
+quick_sort(__constant int *count,
+           __global Data *data)
+{
+    int begin = 0;
+    int end = count[0];
+
+    int stack[1024];
+    int sp = 0;
+    int p;
+    while (1) {
+        while (begin < end) {
+            int where = (begin + end) / 2;
+            int pivot = data[where].index;
+            data[where].index = data[begin].index;
+            int i;
+            p = begin;
+            for (i=begin+1; i<=end; i++) {
+                if (data[i].index < pivot) {
+                    p++;
+                    swap(data, p, i);
+                }
+            }
+            data[begin].index = data[p].index;
+            data[p].index = pivot;
+
+            stack[sp++] = p + 1;
+            stack[sp++] = end;
+            end = p - 1;
+        }
+        if (sp == 0) return;
+        end = stack[--sp];
+        begin = stack[--sp];
+        begin = p + 1;
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/many_task/gpu/gpu_task_init.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,12 @@
+#include "Func.h"
+#include "GpuScheduler.h"
+#include "Scheduler.h"
+
+SchedExternTask(SortSimple);
+
+void
+task_init(void)
+{
+    SchedRegister(SortSimple);
+    GpuSchedRegister(QUICK_SORT, "gpu/QuickSort.cl", "quick_sort");
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/many_task/gpu/sort_test.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,183 @@
+#include <stdlib.h>
+#include <OpenCL/opencl.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include "sort.h"
+#include "sort_test.h"
+#define DEFAULT 432
+
+extern int data_length;
+extern DataPtr data;
+
+// for timing
+static double st_time;
+static double ed_time;
+static int length = DEFAULT;
+
+int
+init(int argc, char **argv)
+{
+    for (int i = 1; argv[i]; ++i) {
+        if (strcmp(argv[i], "--length") == 0 || strcmp(argv[i], "-l") == 0) {
+            length = atoi(argv[++i]);
+        }
+    }
+
+    return 0;
+}
+
+int
+get_split_num(int len, int num)
+{
+    if (len / num < MAX_BLOCK_SIZE) {
+        return num;
+    } else {
+        // round up
+        return (len + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE;
+    }
+}
+
+
+static double
+getTime()
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
+void
+show( Data *data, int size )
+{
+    puts("-----------------------------------------------");
+    for(int i=0; i<=size; i++) printf("data[%02d].index = %d\n", i, data[i].index);
+    puts("-----------------------------------------------");
+    return;
+}
+
+Sort sorter;
+
+static void
+check_data()
+{
+    for(int i=0; i< sorter.data_length-1;i++) {
+        if (sorter.data[i].index>sorter.data[i+1].index)  {
+            printf("Data are not sorted at %d. %d > %d \n",i, sorter.data[i].index,sorter.data[i+1].index);
+            return;
+        }
+    }
+    printf("Data are sorted\n");
+}
+
+void
+gpu_init()
+{
+    clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
+    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
+                   &ret_num_devices);
+
+    context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
+    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
+
+    // open the kernel file
+
+    const char* filename = "QuickSort.cl";
+    const char* functionname = "quick_sort";
+
+    int fp = open(filename, O_RDONLY);
+
+    if (!fp) {
+        fprintf(stderr, "Failed to load kernel.\n");
+        exit(1);
+    }
+
+    struct stat stats;
+    fstat(fp,&stats);
+    off_t size = stats.st_size;
+
+    if (!size) {
+        fprintf(stderr, "Failed to load kernel.\n");
+        exit(1);
+    }
+
+    char *kernel_src_str = new char[size];
+    size_t kernel_code_size = read(fp, kernel_src_str, size);
+    close(fp);
+
+    program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
+                                                   (const size_t *)&kernel_code_size, &ret);
+    clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
+    kernel = clCreateKernel(program,functionname, &ret);
+}
+
+void
+sort_start(Sort s){
+
+    Sort sorter = s;
+    int length = sorter.data_length;
+
+    // create the memory buffers
+    cl_mem mem_count = clCreateBuffer(context, CL_MEM_READ_ONLY,sizeof(int),NULL, &ret);
+    cl_mem mem_data = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Data)*length, NULL, &ret);
+
+    st_time = getTime();
+
+    // write the input data into the memory buffers
+    ret = clEnqueueWriteBuffer(command_queue, mem_count, CL_TRUE, 0,
+                               sizeof(int), &length, 0, NULL, NULL);
+    ret = clEnqueueWriteBuffer(command_queue, mem_data, CL_TRUE, 0,
+                               sizeof(Data)*length, sorter.data, 0, NULL, NULL);
+
+    //print_data(data, count, "before");
+    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&mem_count);
+    clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&mem_data);
+
+    ev = NULL;
+
+    ret = clEnqueueTask(command_queue, kernel, 0, NULL, &ev);
+
+    // fetch the result from the memory buffer
+    ret = clEnqueueReadBuffer(command_queue, mem_data, CL_TRUE, 0,sizeof(Data)*length, sorter.data, 1, &ev, NULL);
+    clFlush(command_queue);
+    ed_time = getTime();
+    show(sorter.data, length-1);
+    check_data();
+    printf("Time: %0.6f\n",ed_time-st_time);
+
+    clReleaseKernel(kernel);
+    clReleaseProgram(program);
+    clReleaseMemObject(mem_data);
+    clReleaseEvent(ev);
+    clReleaseCommandQueue(command_queue);
+    clReleaseContext(context);
+
+}
+
+int main(int argc, char *argv[]) {
+
+    // if the argument is invalid, set 432 as the default value
+
+
+    if (argc>1) {
+        if (init(argc,argv) < 0) {
+            return -1;
+        }
+    }
+
+    gpu_init();
+
+    sorter.data = new Data[length];
+    sorter.data_length = length;
+    sorter.split_num = get_split_num(sorter.data_length, 1); // (length, cpu_num)
+
+    for (int i = 0; i < length; i++) {
+        sorter.data[i].index = rand()%10000;
+        sorter.data[i].ptr   = i;
+    }
+
+    sort_start(sorter);
+    return 0;
+}
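
For reference, this standalone test is what the gpu/Makefile above builds (TARGET = sort_test), so a typical run from example/many_task/gpu would presumably look like:

    make
    ./sort_test -l 1024    # or --length; the default length is 432

The program fills sorter.data with random indices, runs the QuickSort.cl kernel once via clEnqueueTask, and prints the data, a sorted/not-sorted check, and the elapsed time.
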
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/example/many_task/gpu/sort_test.h	Wed Nov 14 09:45:28 2012 +0900
@@ -0,0 +1,11 @@
+    cl_platform_id platform_id;
+    cl_uint ret_num_platforms;
+    cl_device_id device_id;
+    cl_uint ret_num_devices;
+    cl_int ret;
+
+    cl_context context;
+    cl_command_queue command_queue;
+    cl_program program;
+    cl_kernel kernel;
+    cl_event ev;
--- a/example/many_task/main.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/main.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -25,6 +25,7 @@
 static double ed_time;
 
 static int length = 1200;
+CPU_TYPE spe_cpu = SPE_ANY;
 
 // prototype
 void TMend(TaskManager *);
@@ -36,17 +37,6 @@
     gettimeofday(&tv, NULL);
     return tv.tv_sec + (double)tv.tv_usec*1e-6;
 }
-/*
-static void
-show_data(void)
-{
-    puts("-----------------------------------------------");
-    for(int i = 0; i < data_length; i++) {
-        printf("data[%02d].index = %d\n", i, data[i].index);
-    }
-    puts("-----------------------------------------------");
-}
-*/
 
 const char *usr_help_str = "Usage: ./sort [option]\n \
 options\n\
@@ -64,6 +54,9 @@
         if (strcmp(argv[i], "-a") == 0 ) {
             all = 1;
         }
+        if (strcmp(argv[i], "-g") == 0 ) {
+            spe_cpu = GPU_0;
+        }
         if (strcmp(argv[i], "-c") == 0 ) {
             sort_task = SortCompat;
         }
@@ -79,6 +72,16 @@
 Sort sorter;
 
 static void
+show_data(void)
+{
+    puts("-----------------------------------------------");
+    for(int i = 0; i < sorter.data_length; i++) {
+        printf("data[%02d].index = %d\n", i, sorter.data[i].index);
+    }
+    puts("-----------------------------------------------");
+}
+
+static void
 check_data()
 {
     for(int i=0; i< sorter.data_length-1;i++) {
@@ -111,9 +114,12 @@
         sorter.data[i].ptr   = i;
     }
 
+    // show_data();
     HTaskPtr restart = manager->create_task(sort_task,0,0,0,0);
+    // SortSimple is set by default; SortSimple is defined in sort.cc
+
     restart->set_param(0,(memaddr)&sorter);
-    //set flip flag
+    // set flip flag
     restart->spawn();
 }
 
@@ -129,7 +135,6 @@
     task_init();
 
     int cpu  = manager->get_cpuNum();
-
     // in case of -cpu 0
     if (cpu==0) cpu = 1;
     if (1) {
@@ -152,7 +157,7 @@
 TMend(TaskManager *manager)
 {
     ed_time = getTime();
-    //show_data();
+    show_data();
     check_data();
     printf("Time: %0.6f\n",ed_time-st_time);
 }
--- a/example/many_task/ppe/Makefile	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/ppe/Makefile	Wed Nov 14 09:45:28 2012 +0900
@@ -25,6 +25,12 @@
 debug: $(TARGET)
 	sudo gdb ./$(TARGET) 
 
+loop:  $(OBJS) $(TASK_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS) $(LOOP)
+
+rec:  $(OBJS) $(TASK_OBJS)
+	$(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS) $(REC)
+
 clean:
 	rm -f $(TARGET) $(OBJS) $(TASK_OBJS)
 	rm -f *~ \#*
--- a/example/many_task/ppe/QuickSort.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/ppe/QuickSort.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -9,17 +9,17 @@
 static void
 swap( Data *data, int left, int right )
 {
-    Data tmp          = data[left];
+    Data tmp    = data[left];
     data[left]  = data[right];
     data[right] = tmp;
 }
 
-// #define USE_MEMCPY
+//#define USE_MEMCPY
 
 static int
 run(SchedTask *s, void* rbuff, void* wbuff) {
     // copy value
-    int begin   = 0;
+    int begin  = 0;
 #if USE_SIMPLE_TASK
     int end = s->read_size()/sizeof(Data);
     Data *r_data = (Data*)rbuff;
@@ -37,7 +37,7 @@
     // printf("[PPE] Quick: length:%d addr->%x \n",end, (int)rbuff);
     // printf("[PPE] Quick: data[0]: %ld addr->%lx\n",sizeof(r_data),(long)r_data);
 
-    quick_sort(r_data, begin, end-1);
+    quick_sort(r_data, begin, end);
 
 #ifdef USE_MEMCPY
     memcpy(w_data, r_data, sizeof(Data)*end);
@@ -50,32 +50,38 @@
 
 void
 qsort_test(Data *data, int begin, int end ) {
-    quick_sort(data, begin, end);
-    printf("end is %d\n",end);
+    quick_sort(data, begin, end-1);
 }
 
 static void
-quick_sort(Data *data, int begin, int end ) {
+quick_sort( Data *data, int begin, int end ) {
+    int stack[1024];
+    int sp = 0;
+    int p;
+    while (1) {
+        while (begin < end) {
+            int where = (begin + end) / 2;
+            int pivot = data[where].index;
+            data[where].index = data[begin].index;
+            int i;
+            p = begin;
+            for (i=begin+1; i<=end; i++) {
+                if (data[i].index < pivot) {
+                    p++;
+                    swap(data, p, i);
+                }
+            }
+            data[begin].index = data[p].index;
+            data[p].index = pivot;
 
-    if (begin < end) {
-        int where = (begin + end) / 2;
-        int pivot = data[where].index;
-        data[where].index = data[begin].index;
-        int p = begin;
-        int i;
-        for (i=begin+1; i<end; i++) {
-            if (data[i].index < pivot) {
-                p++;
-                swap(data, p, i);
-            }
+            stack[sp++] = p + 1;
+            stack[sp++] = end;
+            end = p - 1;
         }
-        data[begin].index = data[p].index;
-        data[p].index = pivot;
-
-        quick_sort(data, begin, p-1);
-        quick_sort(data, p+1, end); // tail call
+        if (sp == 0) return;
+        end = stack[--sp];
+        begin = stack[--sp];
+        begin = p + 1;
     }
 }
-
-
 /* end */
--- a/example/many_task/ppe/QuickSort.cc.loop	Tue Oct 02 18:17:34 2012 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-#include "QuickSort.h"
-#include <stdio.h>
-#include <string.h>
-
-SchedDefineTask(QuickSort);
-
-static void quick_sort( Data *data, int begin, int end ) ;
-extern void show_data(DataPtr, int);
-
-static void
-swap( Data *data, int left, int right )
-{
-    Data tmp          = data[left];
-    data[left]  = data[right];
-    data[right] = tmp;
-}
-
-// #define USE_MEMCPY
-
-static int
-run(SchedTask *s, void* rbuff, void* wbuff)
-{
-    // copy value
-    int begin   = 0;
-#if USE_SIMPLE_TASK
-    int end = s->read_size()/sizeof(Data);
-    Data *r_data = (Data*)rbuff;
-#ifdef USE_MEMCPY
-    Data *w_data = (Data*)wbuff;
-#endif
-#else
-    int end = s->get_inputSize(0)/sizeof(Data);
-    DataPtr r_data = (DataPtr)s->get_input(0);
-#ifdef USE_MEMCPY
-    DataPtr w_data = (DataPtr)s->get_output(0);
-#endif
-#endif
-
-    printf("[PPE] Quick: data[0]: %ld addr->%lx\n",sizeof(r_data),(long)r_data);
-
-    //    show_data(r_data, end);
-    quick_sort(r_data, begin, end-1);
-    //    show_data(r_data, end);
-#ifdef USE_MEMCPY
-    memcpy(w_data, r_data, sizeof(Data)*end);
-#else
-    s->swap();
-#endif
-
-    return 0;
-}
-
-void
-qsort_test(Data *data, int begin, int end ) {
-    quick_sort(data, begin, end);
-    printf("end is %d\n",end);
-}
-
-static void
-quick_sort( Data *data, int begin, int end ) {
-    int stack[1024];
-    int sp = 0;
-    int p = begin;
-    while (begin < end) {
-        while (begin < end) {
-            int where = (begin + end) / 2;
-            int pivot = data[where].index;
-            data[where].index = data[begin].index;
-            int i;
-            for (i=begin+1; i<end; i++) {
-                if (data[i].index < pivot) {
-                    p++;
-                    swap(data, p, i);
-                }
-            }
-            data[begin].index = data[p].index;
-            data[p].index = pivot;
-
-            stack[sp++] = begin;
-            stack[sp++] = end;
-            end = p-1;
-
-            // quick_sort(data, begin, p-1);
-            // only begin and p-1 change (keep just these)
-            // it is enough to build a stack for begin and p-1 (end)
-        }
-        if (sp == 0) return;
-        end = stack[--sp];
-        begin = stack[--sp];
-        begin = p+1;
-        // quick_sort(data, p+1, end); // tail call
-        // continue straight into the loop
-    }
-}
-
-
-/* end */
--- a/example/many_task/ppe/sort_test.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/ppe/sort_test.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -3,13 +3,21 @@
 #include <stdlib.h>
 //#include "sort.h"
 #include "QuickSort.h"
+#include <sys/time.h>
 // sort.cc
 extern int data_length;
 extern DataPtr data;
-extern void quick_sort(DataPtr, int, int);
 static int length = 1200;
 extern void qsort_test(Data*, int, int);
 
+static double
+getTime()
+{
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
 void
 show( Data *data, int size )
 {
@@ -33,6 +41,19 @@
 
 Sort sorter;
 
+void
+check_data()
+{
+    for(int i=0; i< sorter.data_length-1;i++) {
+        if (sorter.data[i].index>sorter.data[i+1].index)  {
+            printf("Data are not sorted at %d. %d > %d \n",i, sorter.data[i].index,sorter.data[i+1].index);
+            return;
+        }
+    }
+    printf("Data are sorted\n");
+}
+
+
 int
 main(int argc, char *argv[])
 {
@@ -50,9 +71,15 @@
 
     int begin = 0;
     int end = length;
-    show(sorter.data, end-1);
+    double st_time;
+    double ed_time;
+    //show(sorter.data, end-1);
+    st_time = getTime();
     qsort_test(sorter.data, begin, end);
-    show(sorter.data, end-1);
+    ed_time = getTime();
+    printf("Time: %0.6f\n",ed_time-st_time);
 
+    //show(sorter.data, end-1);
+    check_data();
     return 0;
 }
--- a/example/many_task/ppe/task_init.cc	Tue Oct 02 18:17:34 2012 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,17 +0,0 @@
-#include "Func.h"
-#include "Scheduler.h"
-
-SchedExternTask(QuickSort);
-SchedExternTask(SortSimple);
-SchedExternTask(SortCompat);
-
-void
-task_init(void)
-{
-    // ex
-    // SchedRegisterNDRange(dim,global_size, local_size)
-
-    SchedRegisterTask(QUICK_SORT, QuickSort);
-    SchedRegister(SortSimple);
-    SchedRegister(SortCompat);
-}
--- a/example/many_task/sort-compat.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/sort-compat.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -6,6 +6,7 @@
 
 extern void check_data();
 extern int all;  // allocate task at once
+extern CPU_TYPE spe_cpu; 
 
 SchedDefineTask1(SortCompat, sort_start_compat );
 
@@ -37,7 +38,7 @@
 	if (i<s->split_num-2 && s->bsort[i]) {
 	    s->fsort[i]->wait_for(s->bsort[i]);
 	}
-	s->fsort[i]->set_cpu(SPE_ANY);
+	s->fsort[i]->set_cpu(spe_cpu);
     }
 
     // the last block is a remainder, so use last_block_num
@@ -50,7 +51,7 @@
 	if (i>0 && s->bsort[i-1]) {
 	    s->fsort[i]->wait_for(s->bsort[i-1]);
 	}
-	s->fsort[i]->set_cpu(SPE_ANY);
+	s->fsort[i]->set_cpu(spe_cpu);
    }
 
     if (s->split_num > 1) {
@@ -62,7 +63,7 @@
 				 sizeof(Data)*block_num);
 	    s->bsort[i]->set_outData(0,&s->data[i*block_num+half_block_num],
 				  sizeof(Data)*block_num);
-	    s->bsort[i]->set_cpu(SPE_ANY);
+	    s->bsort[i]->set_cpu(spe_cpu);
 	}
 
 	{
@@ -74,7 +75,7 @@
 				 sizeof(Data)*last_half_block_num);
 	    s->bsort[i]->set_outData(0,&s->data[i*block_num+half_block_num],
 				  sizeof(Data)*last_half_block_num);
-	    s->bsort[i]->set_cpu(SPE_ANY);	
+	    s->bsort[i]->set_cpu(spe_cpu);	
 	}
 	
 	for (int i = 0; i < half_num; i++) {
--- a/example/many_task/sort.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/sort.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -6,6 +6,7 @@
 
 extern int get_split_num(int len, int num);
 extern int all;  // allocate task at once
+extern CPU_TYPE spe_cpu ;
 
 /**
  * such that the number of data items in one block does not exceed MAX_BLOCK_SIZE
@@ -23,12 +24,12 @@
 get_split_num(int len, int num)
 {
     if (len / num < MAX_BLOCK_SIZE) {
-	return num;
+        return num;
     } else {
-	// round up
-	return (len + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE;
+        // round up
+        return (len + MAX_BLOCK_SIZE - 1) / MAX_BLOCK_SIZE;
     }
-}	
+}
 
 
 /**
@@ -53,79 +54,91 @@
     int last_half_block_num = half_block_num+(last_block_num/2);
 
     if (--sort_count < 0) {
-	return 0;
+        return 0;
     }
 
 
+
     for (int i = 0; i < s->split_num-1; i++) {
-	s->fsort[i] = manager->create_task(QUICK_SORT,
-	    (memaddr)&s->data[i*block_num], sizeof(Data)*block_num,
-	    (memaddr)&s->data[i*block_num], sizeof(Data)*block_num);
-	if (i>0 && s->bsort[i-1]) {
-	    s->fsort[i]->wait_for(s->bsort[i-1]);
-	}
-	if (i<s->split_num-2 && s->bsort[i]) {
-	    s->fsort[i]->wait_for(s->bsort[i]);
-	}
-	s->fsort[i]->set_cpu(SPE_ANY);
+        s->fsort[i] = manager->create_task(QUICK_SORT,
+            (memaddr)&s->data[i*block_num], sizeof(Data)*block_num,
+            (memaddr)&s->data[i*block_num], sizeof(Data)*block_num);
+
+        s->fsort[i]->flip();
+
+        if (i>0 && s->bsort[i-1]) {
+            s->fsort[i]->wait_for(s->bsort[i-1]);
+        }
+        if (i<s->split_num-2 && s->bsort[i]) {
+            s->fsort[i]->wait_for(s->bsort[i]);
+        }
+        s->fsort[i]->set_cpu(GPU_0);
+        s->fsort[i]->set_param(0,(memaddr)block_num);
     }
 
     // the last block is a remainder, so use last_block_num
     {
-	int i = s->split_num-1;
+        int i = s->split_num-1;
 
-	s->fsort[i] = manager->create_task(QUICK_SORT,
-	    (memaddr)&s->data[i*block_num], sizeof(Data)*last_block_num,
-	    (memaddr)&s->data[i*block_num], sizeof(Data)*last_block_num);
-	if (i>0 && s->bsort[i-1]) {
-	    s->fsort[i]->wait_for(s->bsort[i-1]);
-	}
-	s->fsort[i]->set_cpu(SPE_ANY);
+        s->fsort[i] = manager->create_task(QUICK_SORT,
+            (memaddr)&s->data[i*block_num], sizeof(Data)*last_block_num,
+            (memaddr)&s->data[i*block_num], sizeof(Data)*last_block_num);
+        s->fsort[i]->flip();
+        if (i>0 && s->bsort[i-1]) {
+            s->fsort[i]->wait_for(s->bsort[i-1]);
+        }
+        s->fsort[i]->set_cpu(GPU_0);
+        s->fsort[i]->set_param(0,(memaddr)last_block_num);
    }
 
     if (s->split_num > 1) {
 
-	for (int i = 0; i < half_num-1; i++) {
-	    if (s->bsort[i]) manager->free_htask(s->bsort[i]);
-	    s->bsort[i] = manager->create_task(QUICK_SORT,
-		(memaddr)&s->data[i*block_num+half_block_num], sizeof(Data)*block_num,
-		(memaddr)&s->data[i*block_num+half_block_num], sizeof(Data)*block_num);
-	    s->bsort[i]->set_cpu(SPE_ANY);
-	}
+        for (int i = 0; i < half_num-1; i++) {
+            if (s->bsort[i]) manager->free_htask(s->bsort[i]);
+            s->bsort[i] = manager->create_task(QUICK_SORT,
+                (memaddr)&s->data[i*block_num+half_block_num], sizeof(Data)*block_num,
+                (memaddr)&s->data[i*block_num+half_block_num], sizeof(Data)*block_num);
+            s->bsort[i]->flip();
+            s->bsort[i]->set_cpu(GPU_0);
+            s->bsort[i]->set_param(0,(memaddr)block_num);
+        }
 
-	{
-	    int i = half_num-1;
+        {
+            int i = half_num-1;
 
-	    if (s->bsort[i]) manager->free_htask(s->bsort[i]);
-	    s->bsort[i] = manager->create_task(QUICK_SORT,
-		(memaddr)&s->data[i*block_num+half_block_num], sizeof(Data)*last_half_block_num,
-		(memaddr)&s->data[i*block_num+half_block_num], sizeof(Data)*last_half_block_num);
-	    s->bsort[i]->set_cpu(SPE_ANY);	
-	}
-	
-	for (int i = 0; i < half_num; i++) {
-	    s->bsort[i]->wait_for(s->fsort[i]);
-	    s->bsort[i]->wait_for(s->fsort[i+1]);
-	    s->bsort[i]->no_auto_free();
-	    s->bsort[i]->spawn();
-	}
+            if (s->bsort[i]) manager->free_htask(s->bsort[i]);
+            s->bsort[i] = manager->create_task(QUICK_SORT,
+                (memaddr)&s->data[i*block_num+half_block_num], sizeof(Data)*last_half_block_num,
+                (memaddr)&s->data[i*block_num+half_block_num], sizeof(Data)*last_half_block_num);
+            s->bsort[i]->flip();
+            s->bsort[i]->set_cpu(GPU_0);
+            s->bsort[i]->set_param(0,(memaddr)last_half_block_num);
+        }
+
+        for (int i = 0; i < half_num; i++) {
+            s->bsort[i]->wait_for(s->fsort[i]);
+            s->bsort[i]->wait_for(s->fsort[i+1]);
+            s->bsort[i]->no_auto_free();
+            s->bsort[i]->spawn();
+        }
     }
 
     HTaskPtr restart = manager->create_task(SortSimple,0,0,0,0);
     restart->set_param(0,(memaddr)s);
     if (!all) restart->wait_for(s->fsort[0]);
     for (int i = 0; i < s->split_num; i++) {
-	s->fsort[i]->spawn();
+        s->fsort[i]->spawn();
     }
     if (sort_count == 1) {
-	// last loop wait for all task 
-	// we should not need this?
-	for (int i = 0; i < half_num; i++) {
-	    restart->wait_for(s->bsort[i]);
-	    s->bsort[i]->auto_free();
-	}
+        // last loop wait for all task
+        // we should not need this?
+        for (int i = 0; i < half_num; i++) {
+            restart->wait_for(s->bsort[i]);
+            s->bsort[i]->auto_free();
+        }
     }
     restart->spawn();
+
     return 0;
 }
 
--- a/example/many_task/task_init.cc	Tue Oct 02 18:17:34 2012 +0900
+++ b/example/many_task/task_init.cc	Wed Nov 14 09:45:28 2012 +0900
@@ -1,14 +1,23 @@
 #include "Func.h"
 #include "Scheduler.h"
+#include "GpuScheduler.h"
 
+#ifndef __CERIUM_GPU__
 SchedExternTask(QuickSort);
+#endif // __CERIUM_GPU__
 SchedExternTask(SortSimple);
 SchedExternTask(SortCompat);
 
 void
 task_init(void)
 {
+#ifdef __CERIUM_GPU__
+    GpuSchedRegister(QUICK_SORT, "sort.cl", "sort");
+#else
     SchedRegisterTask(QUICK_SORT, QuickSort);
+#endif
+
+
     SchedRegister(SortSimple);
     SchedRegister(SortCompat);
 }