Commit 82f99878 authored by zahoussem

ll

parent 1ae6f17d
@@ -2,42 +2,13 @@
#include <tuple>
#include <iostream>
using namespace std;
// In this example we use a very small number of blocks and threads in those
// blocks, on a very small array, for illustration.
#define N 8
#define numThread 8 // 8 threads per block
#define numBlock 1  // 1 block
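// pruda_task_t is an early sketch of a task object that only captures a
// kernel's argument values in a tuple; kernel_t further down is the variant
// actually used by submit_task().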
template<typename ...Arguments>
struct pruda_task_t {
tuple<Arguments...> args;
};
@@ -65,26 +36,17 @@ struct make_indexes : make_indexes_impl<0, index_tuple<>, Types...>
{};
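// make_indexes builds a compile-time list of tuple indexes so apply_helper
// can expand a stored tuple back into an argument pack. The index_tuple /
// make_indexes_impl helpers are elided by this hunk; they presumably follow
// the classic pre-C++14 integer-sequence pattern, sketched here as an
// assumption (the helpers themselves are not shown in this commit):
//
//   template<int... Indexes> struct index_tuple {};
//
//   template<int I, typename IndexTuple, typename... Types>
//   struct make_indexes_impl;
//
//   template<int I, int... Indexes, typename T, typename... Types>
//   struct make_indexes_impl<I, index_tuple<Indexes...>, T, Types...>
//     : make_indexes_impl<I + 1, index_tuple<Indexes..., I>, Types...> {};
//
//   template<int I, int... Indexes>
//   struct make_indexes_impl<I, index_tuple<Indexes...>> {
//     typedef index_tuple<Indexes...> type;
//   };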
template<class Ret, class... Args, int... Indexes >
Ret apply_helper(int gs, int bs, Ret (*pf)(Args...), index_tuple< Indexes... >, tuple<Args...>&& tup)
{
  // Expand the stored tuple into the kernel's argument list and launch the
  // kernel with gs blocks of bs threads each.
  (*pf)<<<gs,bs>>>( forward<Args>( get<Indexes>(tup))... );
}
template<class Ret, class ... Args>
Ret apply(int gs, int bs, Ret (*pf)(Args...), const tuple<Args...>& tup)
{
  return apply_helper(gs, bs, pf, typename make_indexes<Args...>::type(), tuple<Args...>(tup));
}
template<class Ret, class ... Args>
Ret apply(int gs, int bs, Ret (*pf)(Args...), tuple<Args...>&& tup)
{
  return apply_helper(gs, bs, pf, typename make_indexes<Args...>::type(), forward<tuple<Args...>>(tup));
}
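// For example, apply(1, 8, add, make_tuple(dev_a, dev_b, dev_c)) expands the
// tuple and launches add<<<1,8>>>(dev_a, dev_b, dev_c).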
@@ -92,150 +54,227 @@
// The vector-addition kernel executed on the GPU device.
__global__ void add( int *a, int *b, int *c ) {
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  while (tid < N) {
    c[tid] = a[tid] + b[tid];
    tid += blockDim.x * gridDim.x; // grid-stride: step over all launched threads
  }
}
// The element-wise multiplication kernel; the extra parameter h is currently unused.
__global__ void mul( int *a, int *b, int *c, int h ) {
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  while (tid < N) {
    c[tid] = a[tid] * b[tid];
    tid += blockDim.x * gridDim.x; // grid-stride: step over all launched threads
  }
}
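// kernel_t bundles everything needed to launch a kernel later: the function
// pointer, its argument values (captured in a tuple), and the launch
// configuration (gs blocks of bs threads).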
template<typename ...Arguments>
struct kernel_t {
void (*kernel_c)(Arguments...);
tuple<Arguments...> args;
int bs;
int gs;
};
// Two statically allocated task descriptors and the task table that holds
// pointers to them; submit_task() dispatches by index into this table.
struct kernel_t<int *,int*,int *, int> m_1;
struct kernel_t<int *,int*,int *> m_2;
auto tu=make_tuple(&m_1,&m_2);
// Launch the task stored at position `index` in the task table. std::get
// needs a compile-time index, hence the switch instead of a plain lookup.
inline void submit_task(int index){
  switch ( index )
    {
    case 0:
      apply(get<0>(tu)->gs,get<0>(tu)->bs,get<0>(tu)->kernel_c,get<0>(tu)->args);
      break;
    case 1:
      apply(get<1>(tu)->gs,get<1>(tu)->bs,get<1>(tu)->kernel_c,get<1>(tu)->args);
      break;
    //Houssam: I need to add this later
    // case 2:
    //   apply(get<2>(tu)->gs,get<2>(tu)->bs,get<2>(tu)->kernel_c,get<2>(tu)->args);
    //   break;
    default:
      printf("unknown task, exiting \n");
      exit(-1);
    }
}
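// Note: kernel launches are asynchronous. main() below relies on the implicit
// ordering of the default stream (the cudaMemcpy that follows submit_task)
// rather than an explicit cudaDeviceSynchronize().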
// Heap-allocate and fill a task descriptor for the given kernel, launch
// configuration, and argument values.
template<typename ...Arguments>
struct kernel_t<Arguments...> * create_kernel_t(void kernel_c(Arguments...), int gs, int bs,
                                                Arguments...args){
  struct kernel_t<Arguments...> * tau = (struct kernel_t<Arguments...> *)
    (malloc(sizeof(struct kernel_t<Arguments...> )));
  tau->kernel_c = kernel_c;
  tau->args = tuple<Arguments...>(args...);
  tau->gs = gs;
  tau->bs = bs;
  return tau;
}
// Fill an already-allocated task descriptor in place.
template<typename ...Arguments>
void create_kernel(struct kernel_t<Arguments ...> * k, void kernel_c(Arguments...), int gs, int bs,
                   Arguments...args){
  k->kernel_c = kernel_c;
  k->args = tuple<Arguments...>(args...);
  k->gs = gs;
  k->bs = bs;
}
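// Usage sketch (mirroring the calls in main() below): fill a descriptor in
// place, then dispatch it by its index in the task table:
//   create_kernel(&m_2, add, 1, 5, dev_a, dev_b, dev_c);
//   submit_task(1); // launches add<<<1,5>>>(dev_a, dev_b, dev_c)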
int *a, *b, *c;             // host arrays
int *dev_a, *dev_b, *dev_c; // device arrays
int h;
// Still needs to be made to work:
struct pruda_task_t<int*,int*,int*> * me = (struct pruda_task_t<int*,int*,int*> *)(malloc(sizeof(struct pruda_task_t<int*,int*,int*> )));
// Experimental: containers meant to hold a heterogeneous list of task
// descriptors, parameterized on the descriptor types (not used yet).
template<typename ...Arguments>
struct kern_list {
  tuple<Arguments...> tu;
};
template<typename Arguments>
struct kern_list_ {
  Arguments tu;
};
template<typename ...Arguments> kern_list<Arguments...> *k_list;
template<typename Arguments> kern_list_<Arguments> *ker_list;
/*
 * The main program that directs the execution of vector add on the GPU.
 */
int main( void ) {
  // allocate the memory on the CPU
  a = (int*)malloc( N * sizeof(int) );
  b = (int*)malloc( N * sizeof(int) );
  c = (int*)malloc( N * sizeof(int) );
  // fill the arrays 'a' and 'b' on the CPU with dummy values
  for (int i=0; i<N; i++) {
    a[i] = i;
    b[i] = i;
  }
  // allocate the memory on the GPU
  cudaMalloc( (void**)&dev_a, N * sizeof(int) );
  cudaMalloc( (void**)&dev_b, N * sizeof(int) );
  cudaMalloc( (void**)&dev_c, N * sizeof(int) );
  // copy the arrays 'a' and 'b' to the GPU
  cudaMemcpy( dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice );
  cudaMemcpy( dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice );
  int ac=5;
  // Heap-allocating alternative using create_kernel_t (commented out):
  // struct kernel_t<int*,int*,int*, int> * m1 = create_kernel_t(mul,2,5,dev_a,dev_b,dev_c,ac);
  // struct kernel_t<int*,int*,int*> * ad = create_kernel_t(add,2,5,dev_a,dev_b,dev_c);
  create_kernel(&m_1,mul,2,5,dev_a,dev_b,dev_c,ac);
  printf("m_1 grid size: %d \n",m_1.gs);
  create_kernel(&m_2,add,1,5,dev_a,dev_b,dev_c);
  printf("m_2 grid size: %d \n",m_2.gs);
  printf("stored kernel pointer: %p, add is at: %p \n",(void*)m_2.kernel_c,(void*)add);
  // Sketch of the intended kern_list usage (still commented out):
  //   auto qq = make_tuple(mu, ad, mu);
  //   using typeq = decltype(qq);
  //   ker_list = (struct kern_list_<typeq> *)(malloc(sizeof(struct kern_list_<typeq>)));
  //   ker_list->tu = qq;
  int index = 1;
  submit_task(index); // dispatch task 1: add<<<1,5>>>(dev_a, dev_b, dev_c)
  // Execute the vector addition kernel on the GPU device directly, declaring
  // how many blocks and how many threads per block to use (superseded by
  // submit_task above):
  // add<<<numBlock,numThread>>>( dev_a, dev_b, dev_c );
  // copy the array 'c' back from the GPU to the CPU
  cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost );
// verify that the GPU did the work we requested
  bool success = true;
  int total=0;
  printf("Checking %d values in the array.\n", N);
  for (int i=0; i<N; i++) {
    if ((a[i] + b[i]) != c[i]) {
      printf( "Error: %d + %d != %d\n", a[i], b[i], c[i] );
      success = false;
    } else {
      total += 1; // count only the values that checked out
    }
  }
if (success) printf( "We did it, %d values correct!\n", total );
// free the memory we allocated on the CPU
free( a );
free( b );
free( c );
// free the memory we allocated on the GPU
  cudaFree( dev_a );
  cudaFree( dev_b );
  cudaFree( dev_c );

  return 0;
}