PTASK / rtgpgpu · Commits

Commit 82f99878, authored 5 years ago by zahoussem

    ll

parent 1ae6f17d

No related branches, tags, or merge requests found.

Showing 1 changed file: devel/unpack2.cu (+150, −111)
@@ -2,42 +2,13 @@
 #include <tuple>
 #include <iostream>
 using namespace std;
 
-// In this example we use a very small number of blocks
-// and threads in those blocks for illustration
-// on a very small array
 #define N 8
 #define numThread 8 // 2 threads in a block
 #define numBlock 1  // 4 blocks
 
-/*
- * 1.
- * The 'kernel' function that will be executed on the GPU device hardware.
- */
-__global__ void add(int *a, int *b, int *c)
-{
-    printf("here\n");
-    int tid = blockDim.x * blockIdx.x + threadIdx.x;
-    while (tid < N) {
-        c[tid] = a[tid] + b[tid];
-        tid += blockDim.x;
-    }
-}
 
+template <typename... Arguments>
+struct pruda_task_t {
+    tuple<Arguments...> args;
+};
 
 using namespace std;
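An aside on the kernels in this file (the removed add above and its twins below): they advance tid by blockDim.x only, not by blockDim.x * gridDim.x, so with more than one block each block revisits indices that block 0 already covers; the work is redundant (though idempotent here, since numBlock is 1). For comparison, a minimal sketch of the standard grid-stride idiom; add_grid_stride and its n parameter are illustrative, not part of this commit:

// Illustrative only: a grid-stride loop strides by the total thread count
// (blockDim.x * gridDim.x), so the kernel stays correct and non-redundant
// for any <<<gs, bs>>> launch geometry.
__global__ void add_grid_stride(int *a, int *b, int *c, int n)
{
    for (int tid = blockDim.x * blockIdx.x + threadIdx.x; tid < n;
         tid += blockDim.x * gridDim.x) {
        c[tid] = a[tid] + b[tid];
    }
}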
@@ -65,26 +36,17 @@ struct make_indexes : make_indexes_impl<0, index_tuple<>, Types...>
 {};
 
 template <class Ret, class... Args, int... Indexes>
-Ret apply_helper(Ret (*pf)(Args...), index_tuple<Indexes...>, tuple<Args...>&& tup)
+Ret apply_helper(int gs, int bs, Ret (*pf)(Args...), index_tuple<Indexes...>, tuple<Args...>&& tup)
 {
-    (*pf)<<<1, 2>>>(forward<Args>(get<Indexes>(tup))...);
     // printf("%p \n",pf);
     // (*pf)<<<10,10>>>(dev_a,dev_b,dev_c);// forward<Args>( get<Indexes>(tup))... );
     // add<<<10,10>>>(dev_a,dev_b,dev_c);// forward<Args>( get<Indexes>(tup))... )
     // ;
+    (*pf)<<<gs, bs>>>(forward<Args>(get<Indexes>(tup))...);
 }
 
 template <class Ret, class... Args>
-Ret apply(Ret (*pf)(Args...), const tuple<Args...>& tup)
+Ret apply(int gs, int bs, Ret (*pf)(Args...), const tuple<Args...>& tup)
 {
-    return apply_helper(pf, typename make_indexes<Args...>::type(), tuple<Args...>(tup));
+    return apply_helper(gs, bs, pf, typename make_indexes<Args...>::type(), tuple<Args...>(tup));
 }
 
 template <class Ret, class... Args>
 Ret apply(Ret (*pf)(Args...), tuple<Args...>&& tup)
 {
     return apply_helper(pf, typename make_indexes<Args...>::type(), forward<tuple<Args...>>(tup));
 }
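This hunk threads the launch geometry (gs, bs) from the call site down to the <<<...>>> launch in place of the hard-coded <<<1, 2>>>; the hand-rolled index_tuple/make_indexes machinery does the tuple-to-argument-pack expansion. For readers on C++14 or later, a minimal sketch of the same expansion using std::index_sequence instead; launch/launch_impl are illustrative names, not from this file:

#include <tuple>
#include <utility>

template <class Ret, class... Args, std::size_t... Is>
void launch_impl(int gs, int bs, Ret (*pf)(Args...),
                 std::index_sequence<Is...>, std::tuple<Args...> tup)
{
    // Expand the stored tuple back into the kernel's parameter list.
    (*pf)<<<gs, bs>>>(std::get<Is>(tup)...);
}

template <class Ret, class... Args>
void launch(int gs, int bs, Ret (*pf)(Args...), std::tuple<Args...> tup)
{
    launch_impl(gs, bs, pf, std::index_sequence_for<Args...>{}, tup);
}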
@@ -92,150 +54,227 @@ Ret apply(Ret (*pf)(Args...), tuple<Args...>&& tup)
 __global__ void add(int *a, int *b, int *c)
 {
     printf("here 2\n");
     int tid = blockDim.x * blockIdx.x + threadIdx.x;
     while (tid < N) {
         c[tid] = a[tid] + b[tid];
         tid += blockDim.x;
     }
 }
 
+__global__ void mul(int *a, int *b, int *c, int h)
+{
+    printf("here mul\n");
+    int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    while (tid < N) {
+        c[tid] = a[tid] * b[tid];
+        tid += blockDim.x;
+    }
+}
+
+template <typename... Arguments>
+struct kernel_t {
+    void (*kernel_c)(Arguments...);
+    tuple<Arguments...> args;
+    int bs;
+    int gs;
+};
+
+struct kernel_t<int*, int*, int*, int> m_1;
+struct kernel_t<int*, int*, int*> m_2;
+auto tu = make_tuple(&m_1, &m_2);
+
+inline void submit_task(int indexex)
+{
+    switch (indexex) {
+    case 0:
+        apply(get<0>(tu)->gs, get<0>(tu)->bs, get<0>(tu)->kernel_c, get<0>(tu)->args);
+        break;
+    case 1:
+        printf("moi\n");
+        apply(get<1>(tu)->gs, get<1>(tu)->bs, get<1>(tu)->kernel_c, get<1>(tu)->args);
+        break;
+    // Houssam: I need to add this later
+    // case 2:
+    //     apply(get<2>(tu)->gs, get<2>(tu)->bs, get<2>(tu)->kernel_c, get<2>(tu)->args);
+    //     break;
+    default:
+        printf("unknown task, exitting\n");
+        exit(-1);
+    }
+}
 
-/*
- * The main program that directs the execution of vector add on the GPU
- */
-int main(void)
-{
-    int *a, *b, *c;             // The arrays on the host CPU machine
-    int *dev_a, *dev_b, *dev_c; // The arrays for the GPU device
 
+template <typename... Arguments>
+struct kernel_t<Arguments...> *create_kernel_t(void kernel_c(Arguments...), int gs, int bs, Arguments... args)
+{
+    struct kernel_t<Arguments...> *tau =
+        (struct kernel_t<Arguments...> *)(malloc(sizeof(struct kernel_t<Arguments...>)));
 
-    // 2.a allocate the memory on the CPU
-    a = (int *)malloc(N * sizeof(int));
-    b = (int *)malloc(N * sizeof(int));
-    c = (int *)malloc(N * sizeof(int));
 
-    // 2.b. fill the arrays 'a' and 'b' on the CPU with dummy values
-    for (int i = 0; i < N; i++) {
-        a[i] = i;
-        b[i] = i;
-    }
 
+    tau->kernel_c = kernel_c;
+    tau->args = tuple<Arguments...>(args...);
+    tau->gs = gs;
+    tau->bs = bs;
+    return tau;
+}
 
-    // 2.c. allocate the memory on the GPU
-    cudaMalloc((void **)&dev_a, N * sizeof(int));
-    cudaMalloc((void **)&dev_b, N * sizeof(int));
-    cudaMalloc((void **)&dev_c, N * sizeof(int));
 
-    // 2.d. copy the arrays 'a' and 'b' to the GPU
-    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
-    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
 
+template <typename... Arguments>
+void create_kernel(struct kernel_t<Arguments...> *k, void kernel_c(Arguments...), int gs, int bs, Arguments... args)
+{
+    k->kernel_c = kernel_c;
+    k->args = tuple<Arguments...>(args...);
+    k->gs = gs;
+    k->bs = bs;
+}
+
+int *a, *b, *c;
+int *dev_a, *dev_b, *dev_c;
+int h;
+
+// I need to make that working
+struct pruda_task_t<int*, int*, int*> *me =
+    (struct pruda_task_t<int*, int*, int*> *)(malloc(sizeof(struct pruda_task_t<int*, int*, int*>)));
+
+template <typename... Arguments>
+struct kern_list {
+    tuple<Arguments...> tu;
+};
+
+template <typename Arguments>
+struct kern_list_ {
+    Arguments tu;
+};
+
+template <typename... Arguments>
+kern_list<Arguments...> *k_list;
+
+template <typename Arguments>
+kern_list_<Arguments> *ker_list;
 
-    std::tuple<int*, int*, int*> tup(dev_a, dev_b, dev_c);
-    //std::tuple<int, double > tup(10,20.0);
-    apply(add, tup);
 
+int main(void)
+{
+    a = (int *)malloc(N * sizeof(int));
+    b = (int *)malloc(N * sizeof(int));
+    c = (int *)malloc(N * sizeof(int));
+
+    for (int i = 0; i < N; i++) {
+        a[i] = i;
+        b[i] = i;
+    }
+
+    cudaMalloc((void **)&dev_a, N * sizeof(int));
+    cudaMalloc((void **)&dev_b, N * sizeof(int));
+    cudaMalloc((void **)&dev_c, N * sizeof(int));
+
+    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
+    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
+
+    int ac = 5;
+    // struct kernel_t<int*,int*,int*, int> * m1 = create_kernel_t(mul,2,5,dev_a,dev_b,dev_c,ac);
+    // struct kernel_t<int*,int*,int*> * ad = create_kernel_t(add,2,5,dev_a,dev_b,dev_c);
+
+    create_kernel(&m_1, mul, 2, 5, dev_a, dev_b, dev_c, ac);
+    printf("%d print\n", m_1.gs);
+    create_kernel(&m_2, add, 1, 5, dev_a, dev_b, dev_c);
+    printf("%d print\n", m_2.gs);
+    printf("%p print %p\n", m_2.kernel_c, add);
+
+    // auto LLOOl=<int*,int*,int*>;
+    // auto qq=make_tuple(mu,ad,mu);
+    // using typeq = decltype(qq);
+    // ker_list = (struct kern_list_<typeq> *)(malloc(sizeof(struct kern_list_<typeq>)));
+    // ker_list->tu = qq;
+    // qq.p
+    // auto ss = get<0>(l->tu);
+    // std::tuple<std::tuple<kernel_t<int *, int *, int *, int> *, kernel_t<int *, int *, int *> *, kernel_t<int *, int *, int *, int> *> *
+
+    int index = 1;
+    submit_task(1);
 
     // 3. Execute the vector addition 'kernel function' on the GPU device,
     //    declaring how many blocks and how many threads per block to use.
     // add<<<numBlock,numThread>>>( dev_a, dev_b, dev_c );
 
     // 4. copy the array 'c' back from the GPU to the CPU
     cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
 
     // verify that the GPU did the work we requested
     bool success = true;
     int total = 0;
     printf("Checking %d values in the array.\n", N);
     for (int i = 0; i < N; i++) {
         if ((a[i] + b[i]) != c[i]) {
             printf("Error: %d + %d != %d\n", a[i], b[i], c[i]);
             success = false;
         }
         total += 1;
     }
     if (success)
         printf("We did it, %d values correct!\n", total);
 
     // free the memory we allocated on the CPU
     free(a);
     free(b);
     free(c);
 
     // free the memory we allocated on the GPU
     cudaFree(dev_a);
     cudaFree(dev_b);
     cudaFree(dev_c);
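Taken together, the commit replaces the single hard-wired apply(add, tup) launch with a small descriptor model: create_kernel records the kernel pointer, its argument tuple, and the gs/bs launch geometry in a kernel_t, and submit_task(index) replays the stored launch (here submit_task(1) runs add<<<1, 5>>>). The switch in submit_task exists because each kernel_t<...> instantiation is a distinct type. A minimal type-erased sketch (illustrative only, not in the commit) that trades the switch for a std::function table:

#include <functional>
#include <vector>

// Each registered task hides its kernel_t<...>-style type behind a void() closure.
static std::vector<std::function<void()>> task_table;

template <typename... Arguments>
void register_kernel(void kernel_c(Arguments...), int gs, int bs, Arguments... args)
{
    // The argument pack is captured by copy; calling the closure replays the launch.
    task_table.push_back([=]() { kernel_c<<<gs, bs>>>(args...); });
}

// Usage, mirroring this commit's main():
//   register_kernel(add, 1, 5, dev_a, dev_b, dev_c);
//   task_table[0]();   // launches add<<<1, 5>>>(dev_a, dev_b, dev_c)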