Commit 20adb2de authored by Bouillaguet Charles

main MPI with checkpointing

parent 71e2ae3f
.gitignore
@@ -6,3 +6,4 @@
/Cunknown/benchmark
/Cunknown/main
/Cunknown/benchmark_omp
/Cunknown/checkpoint.bin
Makefile
@@ -10,6 +10,9 @@ LDLIBS = -lm
all: main benchmark test test_falsenegative benchmark_omp
main.o: CC=mpicc
main: CC=mpicc
fonctions.o: fonctions.h
fonctions_bonus.o: fonctions.h
main.o: fonctions.h pcg_setseq.h
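# Note: `main.o: CC=mpicc` and `main: CC=mpicc` are GNU make target-specific
# variables: only the MPI main program is compiled and linked with mpicc,
# while the other targets keep the default CC.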
main.c
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>   /* bool, true (used by VERBOSE below) */
#include <err.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#include <mpi.h>

#include "fonctions.h"
static const bool VERBOSE = true;
static const char *CHKPT_FILENAME = "checkpoint.bin";
static const char *ALT_CHKPT_FILENAME = "checkpoint.bin.tmp";
enum chkpt_status {GOOD_CHECKPOINT, NO_CHECKPOINT, BAD_CHECKPOINT};
struct checkpoint_t {
    int nranks;       /* communicator size when the checkpoint was written */
    int known_bits;   /* value of known_low used for this run */
    u64 X[nbiter];    /* problem instance; must match for the checkpoint to be reusable */
    u64 done;         /* number of tasks completed per MPI rank */
    time_t when;      /* wall-clock time at which the checkpoint was taken */
};
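/* The checkpoint is a single fixed-size binary record, written and read with
 * one fwrite/fread of the whole struct. Raw struct dumps are not portable
 * across architectures (padding, endianness), so a job should be restarted
 * on the same kind of machine that wrote the checkpoint. */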
enum chkpt_status load_chkpt(int size, const u64 *X, u64 *done)
{
    *done = 0;
    /* try to load checkpoint file */
    FILE *f = fopen(CHKPT_FILENAME, "r");
    if (f == NULL) {
        perror("Cannot open checkpoint file");
        return NO_CHECKPOINT;
    }
    struct checkpoint_t chkpt;
    size_t check = fread(&chkpt, sizeof(chkpt), 1, f);
    fclose(f);
    if (check != 1) {
        perror("Cannot read checkpoint from file");
        return NO_CHECKPOINT;
    }
    /* verify that the checkpoint matches the current run */
    if (size != chkpt.nranks) {
        printf("Communicator size mismatch. Now=%d, in checkpoint=%d.\n", size, chkpt.nranks);
        return BAD_CHECKPOINT;
    }
    if (known_low != chkpt.known_bits) {
        printf("Guessed bits mismatch. Now=%d, in checkpoint=%d.\n", known_low, chkpt.known_bits);
        return BAD_CHECKPOINT;
    }
    for (int i = 0; i < nbiter; i++)
        if (X[i] != chkpt.X[i]) {
            printf("X[%d] mismatch. Now=%llx, in checkpoint=%llx.\n", i, X[i], chkpt.X[i]);
            return BAD_CHECKPOINT;
        }
    /* checkpoint is fine */
    struct tm *tmp = localtime(&chkpt.when);
    if (tmp == NULL)
        err(1, "localtime");
    char outstr[255];
    if (strftime(outstr, sizeof(outstr), "%Y-%m-%d %H:%M:%S", tmp) == 0)
        errx(1, "strftime returned 0");
    printf("Correct checkpoint loaded from %s. Time = %s\n", CHKPT_FILENAME, outstr);
    printf("Tasks done per MPI rank: %llu.\n", chkpt.done);
    *done = chkpt.done;
    return GOOD_CHECKPOINT;
}
void save_chkpt(const u64 *X, u64 done)
{
    struct checkpoint_t chkpt;
    int size;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* prepare checkpoint data */
    chkpt.nranks = size;
    chkpt.known_bits = known_low;
    for (int i = 0; i < nbiter; i++)
        chkpt.X[i] = X[i];
    chkpt.done = done;
    chkpt.when = time(NULL);

    /* try to open alternate checkpoint file */
    FILE *f = fopen(ALT_CHKPT_FILENAME, "w");
    if (f == NULL) {
        perror("WARNING ! Cannot open temporary checkpoint file");
        return;
    }
    size_t check = fwrite(&chkpt, sizeof(chkpt), 1, f);
    fclose(f);
    if (check != 1) {
        perror("WARNING ! Cannot write temporary checkpoint file");
        return;
    }
    /* writing the new checkpoint was successful: atomically replace any old one */
    if (rename(ALT_CHKPT_FILENAME, CHKPT_FILENAME) != 0)
        perror("WARNING ! Cannot rename tmp checkpoint file");
}
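/* The temp-file + rename() sequence above is atomic on POSIX filesystems:
 * a concurrent reader (or a crash between the two steps) sees either the
 * complete old checkpoint or the complete new one, never a torn file.
 * A minimal sketch of a more durable variant (illustrative only, not part
 * of this commit; it assumes POSIX fsync(2)/fileno(3)) would also force
 * the data to disk before renaming: */
#include <unistd.h>   /* fsync */

static int write_chkpt_durable(const struct checkpoint_t *chkpt)
{
    FILE *f = fopen(ALT_CHKPT_FILENAME, "w");
    if (f == NULL)
        return -1;
    if (fwrite(chkpt, sizeof(*chkpt), 1, f) != 1) {
        fclose(f);
        return -1;
    }
    fflush(f);               /* flush stdio buffers to the kernel */
    fsync(fileno(f));        /* force kernel buffers to stable storage */
    fclose(f);
    /* atomically replace any previous checkpoint */
    return rename(ALT_CHKPT_FILENAME, CHKPT_FILENAME);
}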
/* invoked at the beginning. Sets the range for the current MPI rank. */
void restart(const u64 *X, u64 *range_start, u64 *range_end, u64 *done)
{
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* default: split the tasks into contiguous blocks of (nearly) equal size */
    u64 n_tasks = 1ULL << (2 * known_low - 1);
    u64 tasks_per_rank = n_tasks / size;
    if (n_tasks % size != 0)
        tasks_per_rank += 1;
    *range_start = rank * tasks_per_rank;
    *range_end = (rank + 1) * tasks_per_rank;

    if (rank == 0) {
        enum chkpt_status status = load_chkpt(size, X, done);
        switch (status) {
        case BAD_CHECKPOINT:
            printf("BAD CHECKPOINT. Refusing to start. Please clean up the mess\n");
            /* plain exit() would leave the other ranks blocked in MPI_Bcast below */
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        case NO_CHECKPOINT:
            printf("COLD START.\n");
            break;
        case GOOD_CHECKPOINT:
            printf("WARM START.\n");
            break;
        }
    }
    MPI_Bcast(done, 1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD);
    *range_start += *done;   /* skip the tasks already completed before the restart */

    /* clip the last range to the total number of tasks */
    if (*range_end > n_tasks)
        *range_end = n_tasks;
    if (VERBOSE)
        printf("MPI rank %d : [%llx:%llx]\n", rank, *range_start, *range_end);
}
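/* Worked example of the partitioning above (numbers chosen for illustration):
 * with known_low = 11, n_tasks = 2^(2*11-1) = 2097152; with size = 80 ranks,
 * tasks_per_rank = ceil(2097152 / 80) = 26215, so rank 0 gets [0:26215),
 * rank 1 gets [26215:52430), and the last rank's range is clipped to n_tasks.
 * After a warm start, every rank also skips its first `done` tasks, which is
 * correct because checkpoint() only runs once all ranks have completed the
 * same number of tasks. */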
/* checkpoints the current MPI rank */
void checkpoint(const u64 *X, u64 done)
{
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    /* synchronize everybody: all ranks advance in lockstep by T tasks,
       so rank 0's `done` counter is valid for every rank */
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank == 0)
        save_chkpt(X, done);
}
void do_task(u64 current, struct task_t *task, const u64 *X)
{
    /* split the task index: the high bits give W0, the low bits give WC
       (WC = 1 + 2*... is always odd, as a PCG increment must be) */
    u64 W0 = current >> (known_low - 1);
    u64 WC = 1 + 2 * (current % (1ULL << (known_low - 1)));
    printf("Doing task %llu / %llu\n", W0, WC);
    prepare_task(X, W0, WC, task);
@@ -32,8 +189,13 @@ void do_task(u64 current, struct task_t *task, const u64 *X)
}
int main(int argc, char **argv)
{
    int provided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    if (provided < MPI_THREAD_FUNNELED)
        errx(1, "MPI Thread support not sufficient");
    /* parameter initialization */
    init_var_globales();
@@ -44,10 +206,8 @@ int main()
    pcg128_t vraiS[nboutput];
    pcg(vraiS, X, S0, &c, nboutput);

    /* determine this rank's task range, resuming from a checkpoint if one exists */
    u64 range_start, range_end, done;
    restart(X, &range_start, &range_end, &done);

    double t1 = wtime();
@@ -60,25 +220,25 @@ int main()
        init_task(&task[tid]);
    }
    while (range_start < range_end) {
#pragma omp parallel
        {
            int tid = omp_get_thread_num();
            if (range_start + tid < range_end) {
                do_task(range_start + tid, &task[tid], X);
            }
        }
        /* every rank has completed T more tasks (one per thread);
           note: checkpoint() contains a barrier, so termination is only clean
           if all ranks execute the same number of iterations; the clipped
           last rank may run fewer when tasks_per_rank is not a multiple of T */
        range_start += T;
        done += T;
        checkpoint(X, done);
    }
    /*if (DS640 == 7304601715607344736u) {
        printf("We got the right one!\n");
        printf("DS640 = %llu\n", DS640);
    }*/
    printf("total time = %f\n", wtime() - t1);

    MPI_Finalize();
    return 0;
}
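Usage sketch (the command line below is an assumption, not part of the commit): a first run, e.g. `mpirun -n 80 ./main`, cold-starts and rewrites checkpoint.bin after every batch of T tasks; relaunching the same command after a crash warm-starts from the checkpoint. load_chkpt() refuses to resume if the number of ranks or known_low has changed in the meantime.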
On ppti-gpu-1 (44 Skylake 6152 cores (2017) @ 2.1 GHz):
vs. Jean Zay: Cascade Lake 6248 (2019) @ 2.5 GHz

known_low = 11
==============
With ICC
88 concurrent tasks --> 40.3 s (2341 Msolve/s, 26.7 Msolve/s/thread)
44 concurrent tasks --> 26.9 s (1753 Msolve/s, 39.8 Msolve/s/thread)
22 concurrent tasks --> 20.1 s (1175 Msolve/s, 53.4 Msolve/s/thread)
1 concurrent task --> 17.8 s (60.5 Msolve/s/thread)

Now testing parallelization inside the tasks:
88 threads --> 10.9 s, 1574.0 M/s, 17.9 M/s/thread
44 threads --> 20.53 s, 836.7 M/s, 19.0 M/s/thread
22 threads --> 32.88 s, 522.5 M/s, 23.8 M/s/thread
11 threads --> 58.52 s, 293.6 M/s, 26.7 M/s/thread
This is worse!

2048^2 / 80 tasks of 33.6 s == 52,428 "jobs" of 33.6 s --> 20,000 CPU-hours on Jean Zay

known_low = 12
==============
80,000 CPU-hours on Jean Zay

known_low = 13
==============
320,000 CPU-hours on Jean Zay
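The factor 4 per additional known bit follows from the task count in restart(): n_tasks(k) = 2^(2k-1), so n_tasks(k+1) / n_tasks(k) = 2^2 = 4; assuming the per-task solve time stays roughly constant, the estimate scales 20,000 --> 80,000 --> 320,000 CPU-hours for known_low = 11, 12, 13.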