PTASK / rtgpgpu · Commits

Commit 82f99878, authored 5 years ago by zahoussem

    ll

parent 1ae6f17d

No related branches, tags, or merge requests found.

Showing 1 changed file: devel/unpack2.cu (+150, −111)
@@ -2,42 +2,13 @@
 #include <tuple>
 #include <iostream>
 using namespace std;
 
-// In this example we use a very small number of blocks
-// and threads in those blocks for illustration
-// on a very small array
 #define N 8
 #define numThread 8 // 2 threads in a block
 #define numBlock 1  // 4 blocks
 
-/*
- * 1.
- * The 'kernel' function that will be executed on the GPU device hardware.
- */
-__global__ void add(int *a, int *b, int *c)
-{
-    printf("here\n");
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    while (tid < N) {
-        c[tid] = a[tid] + b[tid];
-        tid += blockDim.x;
-    }
-}
 
+template <typename... Arguments>
+struct pruda_task_t {
+    tuple<Arguments...> args;
+};
 
 using namespace std;
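An aside on the kernels in this file (the removed add above and its twins below): they advance tid by blockDim.x only, not by blockDim.x * gridDim.x, so with more than one block each block revisits indices that block 0 already covers; the work is redundant (though idempotent here, since numBlock is 1). For comparison, a minimal sketch of the standard grid-stride idiom; add_grid_stride and its n parameter are illustrative, not part of this commit:

// Illustrative only: a grid-stride loop strides by the total thread count
// (blockDim.x * gridDim.x), so the kernel stays correct and non-redundant
// for any <<<gs, bs>>> launch geometry.
__global__ void add_grid_stride(int *a, int *b, int *c, int n)
{
    for (int tid = blockDim.x * blockIdx.x + threadIdx.x; tid < n;
         tid += blockDim.x * gridDim.x) {
        c[tid] = a[tid] + b[tid];
    }
}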
@@ -65,26 +36,17 @@ struct make_indexes : make_indexes_impl<0, index_tuple<>, Types...>
 {};
 
 template <class Ret, class... Args, int... Indexes>
-Ret apply_helper(Ret (*pf)(Args...), index_tuple<Indexes...>, tuple<Args...>&& tup)
+Ret apply_helper(int gs, int bs, Ret (*pf)(Args...), index_tuple<Indexes...>, tuple<Args...>&& tup)
 {
-    (*pf)<<<1, 2>>>(forward<Args>(get<Indexes>(tup))...);
     // printf("%p \n",pf);
     // (*pf)<<<10,10>>>(dev_a,dev_b,dev_c);// forward<Args>( get<Indexes>(tup))... );
     // add<<<10,10>>>(dev_a,dev_b,dev_c);// forward<Args>( get<Indexes>(tup))... )
     // ;
+    (*pf)<<<gs, bs>>>(forward<Args>(get<Indexes>(tup))...);
 }
 
 template <class Ret, class... Args>
-Ret apply(Ret (*pf)(Args...), const tuple<Args...>& tup)
+Ret apply(int gs, int bs, Ret (*pf)(Args...), const tuple<Args...>& tup)
 {
-    return apply_helper(pf, typename make_indexes<Args...>::type(), tuple<Args...>(tup));
+    return apply_helper(gs, bs, pf, typename make_indexes<Args...>::type(), tuple<Args...>(tup));
 }
 
 template <class Ret, class... Args>
 Ret apply(Ret (*pf)(Args...), tuple<Args...>&& tup)
 {
     return apply_helper(pf, typename make_indexes<Args...>::type(), forward<tuple<Args...>>(tup));
 }
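This hunk threads the launch geometry (gs, bs) from the call site down to the <<<...>>> launch in place of the hard-coded <<<1, 2>>>; the hand-rolled index_tuple/make_indexes machinery does the tuple-to-argument-pack expansion. For readers on C++14 or later, a minimal sketch of the same expansion using std::index_sequence instead; launch/launch_impl are illustrative names, not from this file:

#include <tuple>
#include <utility>

template <class Ret, class... Args, std::size_t... Is>
void launch_impl(int gs, int bs, Ret (*pf)(Args...),
                 std::index_sequence<Is...>, std::tuple<Args...> tup)
{
    // Expand the stored tuple back into the kernel's parameter list.
    (*pf)<<<gs, bs>>>(std::get<Is>(tup)...);
}

template <class Ret, class... Args>
void launch(int gs, int bs, Ret (*pf)(Args...), std::tuple<Args...> tup)
{
    launch_impl(gs, bs, pf, std::index_sequence_for<Args...>{}, tup);
}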
@@ -92,150 +54,227 @@ Ret apply(Ret (*pf)(Args...), tuple<Args...>&& tup)
 __global__ void add(int *a, int *b, int *c)
 {
     printf("here 2\n");
     int tid = blockDim.x * blockIdx.x + threadIdx.x;
     while (tid < N) {
         c[tid] = a[tid] + b[tid];
         tid += blockDim.x;
     }
 }
 
+__global__ void mul(int *a, int *b, int *c, int h)
+{
+    printf("here mul\n");
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    while (tid < N) {
+        c[tid] = a[tid] * b[tid];
+        tid += blockDim.x;
+    }
+}
+
+template <typename... Arguments>
+struct kernel_t {
+    void (*kernel_c)(Arguments...);
+    tuple<Arguments...> args;
+    int bs;
+    int gs;
+};
+
+struct kernel_t<int*, int*, int*, int> m_1;
+struct kernel_t<int*, int*, int*> m_2;
+auto tu = make_tuple(&m_1, &m_2);
+
+inline void submit_task(int indexex)
+{
+    switch (indexex) {
+    case 0:
+        apply(get<0>(tu)->gs, get<0>(tu)->bs, get<0>(tu)->kernel_c, get<0>(tu)->args);
+        break;
+    case 1:
+        printf("moi\n");
+        apply(get<1>(tu)->gs, get<1>(tu)->bs, get<1>(tu)->kernel_c, get<1>(tu)->args);
+        break;
+    // Houssam: I need to add this later
+    // case 2:
+    //     apply(get<2>(tu)->gs, get<2>(tu)->bs, get<2>(tu)->kernel_c, get<2>(tu)->args);
+    //     break;
+    default:
+        printf("unknown task, exitting\n");
+        exit(-1);
+    }
+}
 
-/*
- * The main program that directs the execution of vector add on the GPU
- */
-int main(void)
-{
-    int *a, *b, *c;             // The arrays on the host CPU machine
-    int *dev_a, *dev_b, *dev_c; // The arrays for the GPU device
 
+template <typename... Arguments>
+struct kernel_t<Arguments...> *create_kernel_t(void kernel_c(Arguments...), int gs, int bs, Arguments... args)
+{
+    struct kernel_t<Arguments...> *tau =
+        (struct kernel_t<Arguments...> *)(malloc(sizeof(struct kernel_t<Arguments...>)));
 
-    // 2.a allocate the memory on the CPU
-    a = (int *)malloc(N * sizeof(int));
-    b = (int *)malloc(N * sizeof(int));
-    c = (int *)malloc(N * sizeof(int));
 
-    // 2.b. fill the arrays 'a' and 'b' on the CPU with dummy values
-    for (int i = 0; i < N; i++) {
-        a[i] = i;
-        b[i] = i;
-    }
 
+    tau->kernel_c = kernel_c;
+    tau->args = tuple<Arguments...>(args...);
+    tau->gs = gs;
+    tau->bs = bs;
+    return tau;
+}
 
-    // 2.c. allocate the memory on the GPU
-    cudaMalloc((void **)&dev_a, N * sizeof(int));
-    cudaMalloc((void **)&dev_b, N * sizeof(int));
-    cudaMalloc((void **)&dev_c, N * sizeof(int));
 
-    // 2.d. copy the arrays 'a' and 'b' to the GPU
-    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
 
+template <typename... Arguments>
+void create_kernel(struct kernel_t<Arguments...> *k, void kernel_c(Arguments...), int gs, int bs, Arguments... args)
+{
+    k->kernel_c = kernel_c;
+    k->args = tuple<Arguments...>(args...);
+    k->gs = gs;
+    k->bs = bs;
+}
+
+int *a, *b, *c;
+int *dev_a, *dev_b, *dev_c;
+int h;
+
+// I need to make that working
+struct pruda_task_t<int*, int*, int*> *me =
+    (struct pruda_task_t<int*, int*, int*> *)(malloc(sizeof(struct pruda_task_t<int*, int*, int*>)));
+
+template <typename... Arguments>
+struct kern_list {
+    tuple<Arguments...> tu;
+};
+
+template <typename Arguments>
+struct kern_list_ {
+    Arguments tu;
+};
+
+template <typename... Arguments>
+kern_list<Arguments...> *k_list;
+
+template <typename Arguments>
+kern_list_<Arguments> *ker_list;
 
-    std::tuple<int*, int*, int*> tup(dev_a, dev_b, dev_c);
-    //std::tuple<int, double > tup(10,20.0);
-    apply(add, tup);
 
+int main(void)
+{
+    a = (int *)malloc(N * sizeof(int));
+    b = (int *)malloc(N * sizeof(int));
+    c = (int *)malloc(N * sizeof(int));
+
+    for (int i = 0; i < N; i++) {
+        a[i] = i;
+        b[i] = i;
+    }
+
+    cudaMalloc((void **)&dev_a, N * sizeof(int));
+    cudaMalloc((void **)&dev_b, N * sizeof(int));
+    cudaMalloc((void **)&dev_c, N * sizeof(int));
+
+    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
+
+    int ac = 5;
+    // struct kernel_t<int*,int*,int*, int> * m1 = create_kernel_t(mul,2,5,dev_a,dev_b,dev_c,ac);
+    // struct kernel_t<int*,int*,int*> * ad = create_kernel_t(add,2,5,dev_a,dev_b,dev_c);
+
+    create_kernel(&m_1, mul, 2, 5, dev_a, dev_b, dev_c, ac);
+    printf("%d print\n", m_1.gs);
+    create_kernel(&m_2, add, 1, 5, dev_a, dev_b, dev_c);
+    printf("%d print\n", m_2.gs);
+    printf("%p print %p\n", m_2.kernel_c, add);
+
+    // auto LLOOl=<int*,int*,int*>;
+    // auto qq=make_tuple(mu,ad,mu);
+    // using typeq = decltype(qq);
+    // ker_list = (struct kern_list_<typeq> *)(malloc(sizeof(struct kern_list_<typeq>)));
+    // ker_list->tu = qq;
+    // qq.p
+    // auto ss = get<0>(l->tu);
+    // std::tuple<std::tuple<kernel_t<int *, int *, int *, int> *, kernel_t<int *, int *, int *> *, kernel_t<int *, int *, int *, int> *> *
+
+    int index = 1;
+    submit_task(1);
 
     // 3. Execute the vector addition 'kernel function' on the GPU device,
     //    declaring how many blocks and how many threads per block to use.
     // add<<<numBlock,numThread>>>( dev_a, dev_b, dev_c );
 
     // 4. copy the array 'c' back from the GPU to the CPU
     cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
 
     // verify that the GPU did the work we requested
     bool success = true;
     int total = 0;
     printf("Checking %d values in the array.\n", N);
     for (int i = 0; i < N; i++) {
         if ((a[i] + b[i]) != c[i]) {
             printf("Error: %d + %d != %d\n", a[i], b[i], c[i]);
             success = false;
         }
         total += 1;
     }
     if (success)
         printf("We did it, %d values correct!\n", total);
 
     // free the memory we allocated on the CPU
     free(a);
     free(b);
     free(c);
 
     // free the memory we allocated on the GPU
     cudaFree(dev_a);
     cudaFree(dev_b);
     cudaFree(dev_c);
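Taken together, the commit replaces the single hard-wired apply(add, tup) launch with a small descriptor model: create_kernel records the kernel pointer, its argument tuple, and the gs/bs launch geometry in a kernel_t, and submit_task(index) replays the stored launch (here submit_task(1) runs add<<<1, 5>>>). The switch in submit_task exists because each kernel_t<...> instantiation is a distinct type. A minimal type-erased sketch (illustrative only, not in the commit) that trades the switch for a std::function table:

#include <functional>
#include <vector>

// Each registered task hides its kernel_t<...>-style type behind a void() closure.
static std::vector<std::function<void()>> task_table;

template <typename... Arguments>
void register_kernel(void kernel_c(Arguments...), int gs, int bs, Arguments... args)
{
    // The argument pack is captured by copy; calling the closure replays the launch.
    task_table.push_back([=]() { kernel_c<<<gs, bs>>>(args...); });
}

// Usage, mirroring this commit's main():
//   register_kernel(add, 1, 5, dev_a, dev_b, dev_c);
//   task_table[0]();   // launches add<<<1, 5>>>(dev_a, dev_b, dev_c)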