Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
pcg
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Bouillaguet Charles
pcg
Commits
20adb2de
Commit
20adb2de
authored
5 years ago
by
Bouillaguet Charles
Browse files
Options
Downloads
Patches
Plain Diff
main MPI avec checkpointing
parent
71e2ae3f
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
.gitignore
+1
-0
1 addition, 0 deletions
.gitignore
Cunknown/Makefile
+3
-0
3 additions, 0 deletions
Cunknown/Makefile
Cunknown/main.c
+174
-14
174 additions, 14 deletions
Cunknown/main.c
Cunknown/organisation.txt
+34
-0
34 additions, 0 deletions
Cunknown/organisation.txt
with
212 additions
and
14 deletions
.gitignore
+
1
−
0
View file @
20adb2de
...
...
@@ -6,3 +6,4 @@
/Cunknown/benchmark
/Cunknown/main
/Cunknown/benchmark_omp
/Cunknown/checkpoint.bin
This diff is collapsed.
Click to expand it.
Cunknown/Makefile
+
3
−
0
View file @
20adb2de
...
...
@@ -10,6 +10,9 @@ LDLIBS = -lm
all
:
main benchmark test test_falsenegative benchmark_omp
main.o
:
CC=mpicc
main
:
CC=mpicc
fonctions.o
:
fonctions.h
fonctions_bonus.o
:
fonctions.h
main.o
:
fonctions.h pcg_setseq.h
...
...
This diff is collapsed.
Click to expand it.
Cunknown/main.c
+
174
−
14
View file @
20adb2de
#include
<stdlib.h>
#include
<stdio.h>
#include
<err.h>
#include
<string.h>
#include
<time.h>
#include
<omp.h>
#include
<mpi.h>
#include
"fonctions.h"
/* When true, restart() prints the task range assigned to each MPI rank. */
static const bool VERBOSE = true;

/* File from which load_chkpt() reads the checkpoint; also the final
 * destination of the rename() performed by save_chkpt(). */
static const char *CHKPT_FILENAME = "checkpoint.bin";

/* Scratch file that save_chkpt() writes first, before renaming it to
 * CHKPT_FILENAME. */
static const char *ALT_CHKPT_FILENAME = "checkpoint.bin.tmp";
/* Outcome of an attempt to load a checkpoint file. */
enum chkpt_status {
    GOOD_CHECKPOINT = 0,    /* file was read and matches the current run */
    NO_CHECKPOINT   = 1,    /* file could not be opened or read */
    BAD_CHECKPOINT  = 2     /* file was read but disagrees with the current run */
};
/*
 * Checkpoint record.  It is written to and read from the checkpoint file as
 * one raw memory image (single fwrite/fread of the whole struct), so the
 * order and types of the fields define the file format.
 */
struct checkpoint_t {
    int    nranks;       /* MPI communicator size at the time of the save */
    int    known_bits;   /* value of known_low at the time of the save */
    u64    X[nbiter];    /* copy of X[0..nbiter-1], compared again on load */
    u64    done;         /* progress counter passed to save_chkpt() */
    time_t when;         /* time(NULL) at the time of the save */
};
/*
 * Try to load the checkpoint stored in CHKPT_FILENAME and check that it
 * belongs to the current run.
 *
 *   size : current MPI communicator size, compared with the saved one.
 *   X    : array of at least nbiter values, compared with the saved copy.
 *   done : output; set to the saved progress counter on GOOD_CHECKPOINT and
 *          to 0 in every other case.
 *
 * Returns
 *   NO_CHECKPOINT   if the file cannot be opened or a full record cannot be
 *                   read from it,
 *   BAD_CHECKPOINT  if the record disagrees with the current run
 *                   (communicator size, known_low, or any X[i]),
 *   GOOD_CHECKPOINT otherwise.
 *
 * Terminates the process through err()/errx() if the saved timestamp cannot
 * be formatted.  known_low and nbiter are globals defined outside this
 * function.
 */
enum chkpt_status load_chkpt(int size, const u64 *X, u64 *done)
{
    *done = 0;

    /* try to load checkpoint file (binary mode: the file is a raw struct image) */
    FILE *f = fopen(CHKPT_FILENAME, "rb");
    if (f == NULL) {
        perror("Cannot open checkpoint file");
        return NO_CHECKPOINT;
    }

    struct checkpoint_t chkpt;
    size_t check = fread(&chkpt, sizeof(chkpt), 1, f);
    if (check != 1) {
        /* Report before fclose(), which may overwrite errno.  A short read
         * (file smaller than the record) is not an I/O error and leaves
         * errno meaningless, so it gets its own message. */
        if (ferror(f))
            perror("Cannot read checkpoint from file");
        else
            fprintf(stderr, "Cannot read checkpoint from file: file is too short\n");
        fclose(f);
        return NO_CHECKPOINT;
    }
    fclose(f);

    /* verify checkpoint */
    if (size != chkpt.nranks) {
        printf("Communicator size mismatch. Now=%d, in checkpoint=%d.\n",
               size, chkpt.nranks);
        return BAD_CHECKPOINT;
    }
    if (known_low != chkpt.known_bits) {
        printf("Guessed bits mismatch. Now=%d, in checkpoint=%d.\n",
               known_low, chkpt.known_bits);
        return BAD_CHECKPOINT;
    }
    for (int i = 0; i < nbiter; i++) {
        if (X[i] != chkpt.X[i]) {
            /* explicit casts: u64 is not guaranteed to be unsigned long long */
            printf("X[%d] mismatch. Now=%llx, in checkpoint=%llx.\n", i,
                   (unsigned long long) X[i], (unsigned long long) chkpt.X[i]);
            return BAD_CHECKPOINT;
        }
    }

    /* checkpoint is fine */
    struct tm *tmp = localtime(&chkpt.when);
    if (tmp == NULL)
        err(1, "localtime");
    char outstr[255];
    if (strftime(outstr, sizeof(outstr), "%Y-%m-%d %H:%M:%S", tmp) == 0)
        errx(1, "strftime returned 0");
    printf("Correct checkpoint loaded from %s. Time = %s\n", CHKPT_FILENAME, outstr);
    /* done is unsigned: print it as such */
    printf("Tasks done per MPI rank: %llu.\n", (unsigned long long) chkpt.done);

    *done = chkpt.done;
    return GOOD_CHECKPOINT;
}
/*
 * Write a checkpoint describing the current run.
 *
 *   X    : array of at least nbiter values, copied into the record.
 *   done : progress counter to store.
 *
 * The record is first written to ALT_CHKPT_FILENAME and only renamed to
 * CHKPT_FILENAME once it has been written AND closed successfully, so that a
 * failed save never replaces the previous checkpoint.  Every failure is
 * reported with perror() and is otherwise non-fatal: the function simply
 * returns.  known_low and nbiter are globals defined outside this function.
 */
void save_chkpt(const u64 *X, u64 done)
{
    struct checkpoint_t chkpt;
    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* prepare checkpoint data */
    chkpt.nranks = size;
    chkpt.known_bits = known_low;
    for (int i = 0; i < nbiter; i++)
        chkpt.X[i] = X[i];
    chkpt.done = done;
    chkpt.when = time(NULL);

    /* try to open alternate checkpoint file (binary mode: raw struct image) */
    FILE *f = fopen(ALT_CHKPT_FILENAME, "wb");
    if (f == NULL) {
        perror("WARNING ! Cannot open temporary checkpoint file");
        return;
    }

    if (fwrite(&chkpt, sizeof(chkpt), 1, f) != 1) {
        /* report first: fclose() may overwrite errno */
        perror("WARNING ! Cannot write temporary checkpoint file");
        fclose(f);
        return;
    }

    /* The record is small enough to sit entirely in the stdio buffer, so the
     * real write may only happen here.  A failing fclose() (disk full, quota,
     * I/O error) means the temporary file is incomplete and must NOT replace
     * the existing checkpoint. */
    if (fclose(f) != 0) {
        perror("WARNING ! Cannot write temporary checkpoint file");
        return;
    }

    /* writing the new checkpoint was successful: replace the old one, if any */
    if (rename(ALT_CHKPT_FILENAME, CHKPT_FILENAME) != 0)
        perror("WARNING ! Cannot rename tmp checkpoint file");
}
/*
 * Invoked at the beginning.  Sets the task range for the current MPI rank.
 *
 *   X           : array forwarded to load_chkpt() for validation (rank 0 only).
 *   range_start : output; first task index this rank still has to do.
 *   range_end   : output; one past the last task index of this rank.
 *   done        : output; progress counter restored from the checkpoint
 *                 (0 without a checkpoint), identical on every rank.
 *
 * The 2^(2*known_low - 1) tasks are split into equal slices of
 * ceil(n_tasks / size) tasks, one slice per rank.  Rank 0 tries to load a
 * checkpoint and broadcasts its progress counter; every rank then skips that
 * many tasks at the start of its slice.  The end of the slice is clipped to
 * n_tasks.
 *
 * A checkpoint that does not match the current run aborts the whole MPI job.
 * Must be called by every rank of MPI_COMM_WORLD (it contains a collective).
 */
void restart(const u64 *X, u64 *range_start, u64 *range_end, u64 *done)
{
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* default */
    /* shift a u64, not an int: 1 << 31 and beyond overflows int (UB) */
    u64 n_tasks = (u64) 1 << (2 * known_low - 1);
    u64 tasks_per_rank = n_tasks / size;
    if (n_tasks % size != 0)
        tasks_per_rank += 1;    /* round up so that the slices cover all tasks */
    *range_start = rank * tasks_per_rank;
    *range_end = (rank + 1) * tasks_per_rank;

    if (rank == 0) {
        enum chkpt_status status = load_chkpt(size, X, done);
        switch (status) {
        case BAD_CHECKPOINT:
            printf("BAD CHECKPOINT. Refusing to start. Please clean up the mess\n");
            fflush(stdout);
            /* The other ranks are about to block in MPI_Bcast: a plain exit()
             * here would leave the job termination to the MPI launcher.
             * MPI_Abort asks for the whole job to be torn down. */
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
            exit(EXIT_FAILURE);    /* in case MPI_Abort returns */
        case NO_CHECKPOINT:
            printf("COLD START.\n");
            break;
        case GOOD_CHECKPOINT:
            printf("WARM START.\n");
            break;
        }
    }

    /* every rank resumes from the progress counter known to rank 0 */
    MPI_Bcast(done, 1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD);
    *range_start += *done;

    /* clip */
    if (*range_end > n_tasks)
        *range_end = n_tasks;

    if (VERBOSE)
        /* explicit casts: u64 is not guaranteed to be unsigned long long */
        printf("MPI rank %d : [%llx:%llx]\n", rank,
               (unsigned long long) *range_start, (unsigned long long) *range_end);
}
/*
 * Collective checkpoint: every rank waits at a barrier on MPI_COMM_WORLD,
 * after which rank 0 alone saves the checkpoint (X and the progress counter
 * `done`) through save_chkpt().
 */
void checkpoint(const u64 *X, u64 done)
{
    int my_rank;

    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* synchronize everybody */
    MPI_Barrier(MPI_COMM_WORLD);

    /* only rank 0 touches the checkpoint file */
    if (my_rank != 0)
        return;
    save_chkpt(X, done);
}
void
do_task
(
u64
current
,
struct
task_t
*
task
,
const
u64
*
X
)
{
u64
W0
=
current
>>
known_low
;
u64
WC
=
current
%
(
1
<<
known_low
);
u64
W0
=
current
>>
(
known_low
-
1
)
;
u64
WC
=
1
+
2
*
(
current
%
(
1
<<
(
known_low
-
1
))
);
printf
(
"Doing task %lld / %lld
\n
"
,
W0
,
WC
);
prepare_task
(
X
,
W0
,
WC
,
task
);
...
...
@@ -32,8 +189,13 @@ void do_task(u64 current, struct task_t *task, const u64 *X)
}
int
main
()
int
main
(
int
argc
,
char
**
argv
)
{
int
provided
;
MPI_Init_thread
(
&
argc
,
&
argv
,
MPI_THREAD_FUNNELED
,
&
provided
);
if
(
provided
<
MPI_THREAD_FUNNELED
)
errx
(
1
,
"MPI Thread support not sufficient"
);
/* INITIALISATION DES PARAMETRES */
init_var_globales
();
...
...
@@ -44,10 +206,8 @@ int main()
pcg128_t
vraiS
[
nboutput
];
pcg
(
vraiS
,
X
,
S0
,
&
c
,
nboutput
);
// restart();
u64
range_start
=
(
5018
<<
known_low
)
+
335
;
u64
range_end
=
range_start
+
10
;
u64
range_start
,
range_end
,
done
;
restart
(
X
,
&
range_start
,
&
range_end
,
&
done
);
double
t1
=
wtime
();
...
...
@@ -60,25 +220,25 @@ int main()
init_task
(
&
task
[
tid
]);
}
u64
current
=
range_start
;
while
(
current
<
range_end
)
{
while
(
range_start
<
range_end
)
{
#pragma omp parallel
{
int
tid
=
omp_get_thread_num
();
if
(
curren
t
+
tid
<
range_end
)
{
do_task
(
curren
t
+
tid
,
&
task
[
tid
],
X
);
if
(
range_star
t
+
tid
<
range_end
)
{
do_task
(
range_star
t
+
tid
,
&
task
[
tid
],
X
);
}
}
// checkpoint();
current
+=
T
;
range_start
+=
T
;
done
+=
T
;
checkpoint
(
X
,
done
);
}
/*if(DS640 == 7304601715607344736u){
printf("On a le bon !\n");
printf("DS640 = %llu\n", DS640);
}*/
printf
(
"temps total = %f
\n
"
,
wtime
()
-
t1
);
MPI_Finalize
();
return
(
0
);
}
This diff is collapsed.
Click to expand it.
Cunknown/organisation.txt
0 → 100644
+
34
−
0
View file @
20adb2de
Sur ppti-gpu-1 (44 coeurs skylake 6152 (2017) @ 2.1Ghz) :
vs jean-zay : cascade lake 6248 (2019) @ 2.5Ghz
known_low = 11
==============
Avec ICC
88 tâches en même temps --> 40.3s (2341 Msolve/s, 26.7 Msolve/s/thread)
44 tâches en même temps --> 26.9s (1753 Msolve/s, 39.8 Msolve/s/thread)
22 tâches en même temps --> 20.1s (1175 Msolve/s, 53.4 Msolve/s/thread)
1 tâche en même temps --> 17.8s (60.5 Msolve/s/thread)
et maintenant, on teste la parallélisation à l'intérieur des tâches
88 threads --> 10.9s, 1574.0M/s, 17.9M/s/thread
44 threads --> 20.53s, 836.7M/s, 19.0M/s/thread
22 threads --> 32.88s, 522.5M/s, 23.8M/s/thread
11 threads --> 58.52s, 293.6M/s, 26.7M/s/thread
c'est moins bien !
2048^2 / 80 tâches de 33.6s == 52 428 "jobs" de 33.6s --> 20000 h*CPU sur jz
known_low = 12
==============
80000 h*CPU sur jz
known_low = 13
==============
320000 h*CPU sur jz
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment