From d4a5f4af20cbfdb68f1b0d5e1b7db782b3004ed2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 3 Feb 2016 01:16:04 +0100 Subject: [PATCH 01/93] Adding support for travis and coverity scan --- .travis.yml | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..a59b617ce --- /dev/null +++ b/.travis.yml @@ -0,0 +1,36 @@ +language: c + +env: + global: + - BUILD_DIR="$TRAVIS_BUILD_DIR/build" + - secure: "Dw0r4cbZoT0QoIlI7XWLdhPQxL62CQzskdw3tUth4eSWJd/0IIvLq/ogaZZbYWGH4lifI4p5NMzEpHJ5yFjWnO/ZcpVTZ26SJEFwfBrPUzeu3MV30poX89EYxWz+dE4xPFKzAAKnemlcVxfTHE+z3vW5klnjg5bRH3+pFKoFVBX6M2ABPXcigkpbd1dnXqQK+36v0yx/SF4Yg607vLLPVfcmcU+ohT52OENVkoeEllOxmGxdVfdskEVQ9O16BbtxNSthYQFplvFn5RrpvknRarby51E+7eMf9sOey2H6MWtZYFKXKiascUz5ZLPIt0mFCW1x2wFC39cxMZNFURg+hHlGKUGV9Bt7B7K/XAwvPhY6X+td4OYPR2B68vywxpPlShiTg8iduYg1H4bdVzt+1qdRHsg2qmyJe5eQBMwW5fEVOffJhfVAiG5ceDWzQJQt5cEKn/2yxt1bX3sC7Hl6lzTQ1X6DuLWi9cCsKqsAB7ht/ZC1m13YGRyjghG9RDsRfM2FPy0w0iOHDAGrNzQVPPDDxxgNi/xpmqPlkvjRSz3Pmj6VeAZ2cG68pls+oKyzcWEhCeSL3uT3ShKBfrpQcFxVFzsdTWzH6XYul/HHrUnYKGtd3K1MW59Dng5WcD5xi8n76/wmlkXUoWmYNjfGsP5wwcz/yLfc/CRDRB3P4jo=" + +addons: + coverity_scan: + project: + name: sclaomir/fti + description: Fault Tolerance Interface + notification_email: scan_notifications@example.com + build_command_prepend: cmake .. + build_command: make + branch_pattern: devel + +os: + - linux + +compiler: + - gcc + +cache: + apt: true + +before_install: + - sudo apt-get update -qq + - sudo apt-get install -y openmpi-bin libopenmpi-dev gfortran + +before_script: + - mkdir $BUILD_DIR + - cd $BUILD_DIR + - cmake .. 
+ +script: make From 2a1c9165f196d0180cd8cc32183af756545dab43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 18:59:03 +0100 Subject: [PATCH 02/93] applying clang-format 'webkit' style to the fti source files --- src/api.c | 273 ++++++++++++--------------- src/checkpoint.c | 156 +++++++--------- src/conf.c | 203 +++++++++----------- src/ftif.c | 13 +- src/meta.c | 114 +++++------ src/postckpt.c | 193 ++++++++++--------- src/postreco.c | 478 ++++++++++++++++++++++++++++++++--------------- src/recover.c | 161 ++++++++-------- src/tools.c | 129 ++++++------- src/topo.c | 242 ++++++++++-------------- 10 files changed, 995 insertions(+), 967 deletions(-) diff --git a/src/api.c b/src/api.c index f60d08b97..3d16b35ac 100644 --- a/src/api.c +++ b/src/api.c @@ -5,16 +5,13 @@ * @brief API functions for the FTI library. */ - #include "fti.h" - /** Array of datasets and all their internal information. */ -static FTIT_dataset FTI_Data[FTI_BUFS]; +static FTIT_dataset FTI_Data[FTI_BUFS]; /** SDC injection model and all the required information. */ -static FTIT_injection FTI_Inje; - +static FTIT_injection FTI_Inje; /*-------------------------------------------------------------------------*/ /** @@ -24,14 +21,14 @@ static FTIT_injection FTI_Inje; **/ /*-------------------------------------------------------------------------*/ -void FTI_Abort() { - FTI_Clean(5, 0, FTI_Topo.myRank); - MPI_Abort(MPI_COMM_WORLD, -1); - MPI_Finalize(); - exit(1); +void FTI_Abort() +{ + FTI_Clean(5, 0, FTI_Topo.myRank); + MPI_Abort(MPI_COMM_WORLD, -1); + MPI_Finalize(); + exit(1); } - /*-------------------------------------------------------------------------*/ /** @brief Initializes FTI. 
@@ -46,27 +43,30 @@ void FTI_Abort() { **/ /*-------------------------------------------------------------------------*/ -int FTI_Init(char *configFile, MPI_Comm globalComm) { +int FTI_Init(char* configFile, MPI_Comm globalComm) +{ FTI_Exec.globalComm = globalComm; MPI_Comm_rank(FTI_Exec.globalComm, &FTI_Topo.myRank); MPI_Comm_size(FTI_Exec.globalComm, &FTI_Topo.nbProc); snprintf(FTI_Conf.cfgFile, FTI_BUFS, "%s", configFile); FTI_Conf.verbosity = 1; FTI_Inje.timer = MPI_Wtime(); - FTI_COMM_WORLD = globalComm; // Temporary before building topology - FTI_Topo.splitRank = FTI_Topo.myRank; // Temporary before building topology + FTI_COMM_WORLD = globalComm; // Temporary before building topology + FTI_Topo.splitRank = FTI_Topo.myRank; // Temporary before building topology int res = FTI_Try(FTI_LoadConf(&FTI_Inje), "load configuration."); - if (res == FTI_NSCS) FTI_Abort(); + if (res == FTI_NSCS) + FTI_Abort(); res = FTI_Try(FTI_Topology(), "build topology."); - if (res == FTI_NSCS) FTI_Abort(); + if (res == FTI_NSCS) + FTI_Abort(); FTI_Try(FTI_InitBasicTypes(FTI_Data), "create the basic data types."); - if (FTI_Topo.myRank == 0) FTI_Try(FTI_UpdateConf(1), "update configuration file."); - if (FTI_Topo.amIaHead) - { // If I am a FTI dedicated process - if (FTI_Exec.reco) - { + if (FTI_Topo.myRank == 0) + FTI_Try(FTI_UpdateConf(1), "update configuration file."); + if (FTI_Topo.amIaHead) { // If I am a FTI dedicated process + if (FTI_Exec.reco) { res = FTI_Try(FTI_RecoverFiles(), "recover the checkpoint files."); - if (res == FTI_NSCS) FTI_Abort(); + if (res == FTI_NSCS) + FTI_Abort(); } res = 0; while (res != FTI_ENDW) { @@ -74,11 +74,12 @@ int FTI_Init(char *configFile, MPI_Comm globalComm) { } FTI_Print("Head stopped listening.", FTI_DBUG); FTI_Finalize(); - } else { // If I am an application process - if (FTI_Exec.reco) - { + } + else { // If I am an application process + if (FTI_Exec.reco) { res = FTI_Try(FTI_RecoverFiles(), "recover the checkpoint files."); - if (res 
== FTI_NSCS) FTI_Abort(); + if (res == FTI_NSCS) + FTI_Abort(); FTI_Exec.ckptCnt = FTI_Exec.ckptID; } } @@ -86,7 +87,6 @@ int FTI_Init(char *configFile, MPI_Comm globalComm) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It returns the current status of the recovery flag. @@ -96,11 +96,11 @@ int FTI_Init(char *configFile, MPI_Comm globalComm) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Status() { +int FTI_Status() +{ return FTI_Exec.reco; } - /*-------------------------------------------------------------------------*/ /** @brief It initializes a data type. @@ -113,14 +113,14 @@ int FTI_Status() { **/ /*-------------------------------------------------------------------------*/ -int FTI_InitType(FTIT_type *type, int size) { +int FTI_InitType(FTIT_type* type, int size) +{ type->id = FTI_Exec.nbType; type->size = size; FTI_Exec.nbType = FTI_Exec.nbType + 1; return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It sets/resets the pointer and type to a protected variable. 
@@ -139,32 +139,30 @@ int FTI_InitType(FTIT_type *type, int size) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Protect(int id, void *ptr, long count, FTIT_type type) { +int FTI_Protect(int id, void* ptr, long count, FTIT_type type) +{ int i, prevSize, updated = 0; char str[FTI_BUFS]; float ckptSize; - for (i = 0; i < FTI_BUFS; i++) - { - if (id == FTI_Data[i].id) - { + for (i = 0; i < FTI_BUFS; i++) { + if (id == FTI_Data[i].id) { prevSize = FTI_Data[i].size; FTI_Data[i].ptr = ptr; FTI_Data[i].count = count; FTI_Data[i].type = type; FTI_Data[i].eleSize = type.size; - FTI_Data[i].size = type.size*count; - FTI_Exec.ckptSize = FTI_Exec.ckptSize + (type.size*count) - prevSize; + FTI_Data[i].size = type.size * count; + FTI_Exec.ckptSize = FTI_Exec.ckptSize + (type.size * count) - prevSize; updated = 1; } } - if (updated) - { - ckptSize = FTI_Exec.ckptSize/(1024.0*1024.0); + if (updated) { + ckptSize = FTI_Exec.ckptSize / (1024.0 * 1024.0); sprintf(str, "Variable ID %d reseted. Current ckpt. size per rank is %.2fMB.", id, ckptSize); FTI_Print(str, FTI_DBUG); - } else { - if (FTI_Exec.nbVar >= FTI_BUFS) - { + } + else { + if (FTI_Exec.nbVar >= FTI_BUFS) { FTI_Print("Too many variables registered.", FTI_EROR); FTI_Clean(5, FTI_Topo.groupID, FTI_Topo.myRank); MPI_Abort(MPI_COMM_WORLD, -1); @@ -176,17 +174,16 @@ int FTI_Protect(int id, void *ptr, long count, FTIT_type type) { FTI_Data[FTI_Exec.nbVar].count = count; FTI_Data[FTI_Exec.nbVar].type = type; FTI_Data[FTI_Exec.nbVar].eleSize = type.size; - FTI_Data[FTI_Exec.nbVar].size = type.size*count; + FTI_Data[FTI_Exec.nbVar].size = type.size * count; FTI_Exec.nbVar = FTI_Exec.nbVar + 1; - FTI_Exec.ckptSize = FTI_Exec.ckptSize + (type.size*count); - ckptSize = FTI_Exec.ckptSize/(1024.0*1024.0); + FTI_Exec.ckptSize = FTI_Exec.ckptSize + (type.size * count); + ckptSize = FTI_Exec.ckptSize / (1024.0 * 1024.0); sprintf(str, "Variable ID %d to protect. Current ckpt. 
size per rank is %.2fMB.", id, ckptSize); FTI_Print(str, FTI_INFO); } return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It corrupts a bit of the given float. @@ -198,22 +195,20 @@ int FTI_Protect(int id, void *ptr, long count, FTIT_type type) { **/ /*-------------------------------------------------------------------------*/ -int FTI_FloatBitFlip(float *target, int bit) +int FTI_FloatBitFlip(float* target, int bit) { - if (bit >= 32 || bit < 0) - { + if (bit >= 32 || bit < 0) { return FTI_NSCS; } - int *corIntPtr = (int*)target; + int* corIntPtr = (int*)target; int corInt = *corIntPtr; corInt = corInt ^ (1 << bit); corIntPtr = &corInt; - float *fp = (float*)corIntPtr; + float* fp = (float*)corIntPtr; *target = *fp; return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It corrupts a bit of the given float. @@ -225,22 +220,20 @@ int FTI_FloatBitFlip(float *target, int bit) **/ /*-------------------------------------------------------------------------*/ -int FTI_DoubleBitFlip(double *target, int bit) +int FTI_DoubleBitFlip(double* target, int bit) { - if (bit >= 64 || bit < 0) - { + if (bit >= 64 || bit < 0) { return FTI_NSCS; } FTIT_double myDouble; myDouble.value = *target; - int bitf = (bit >= 32) ? bit-32 : bit; + int bitf = (bit >= 32) ? bit - 32 : bit; int half = (bit >= 32) ? 1 : 0; FTI_FloatBitFlip(&(myDouble.floatval[half]), bitf); *target = myDouble.value; return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Bit-flip injection following the injection instructions. 
@@ -254,40 +247,33 @@ int FTI_DoubleBitFlip(double *target, int bit) /*-------------------------------------------------------------------------*/ int FTI_BitFlip(int datasetID) { - if (FTI_Inje.rank == FTI_Topo.splitRank) - { - if (datasetID >= FTI_Exec.nbVar) - { + if (FTI_Inje.rank == FTI_Topo.splitRank) { + if (datasetID >= FTI_Exec.nbVar) { return FTI_NSCS; } - if (FTI_Inje.counter < FTI_Inje.number) - { - if ((MPI_Wtime()-FTI_Inje.timer) > FTI_Inje.frequency) - { - if (FTI_Inje.index < FTI_Data[datasetID].count) - { + if (FTI_Inje.counter < FTI_Inje.number) { + if ((MPI_Wtime() - FTI_Inje.timer) > FTI_Inje.frequency) { + if (FTI_Inje.index < FTI_Data[datasetID].count) { char str[FTI_BUFS]; - if (FTI_Data[datasetID].type.id == 9) - { // If it is a double - double *target = FTI_Data[datasetID].ptr+FTI_Inje.index; + if (FTI_Data[datasetID].type.id == 9) { // If it is a double + double* target = FTI_Data[datasetID].ptr + FTI_Inje.index; double ori = *target; int res = FTI_DoubleBitFlip(target, FTI_Inje.position); FTI_Inje.counter = (res == FTI_SCES) ? FTI_Inje.counter + 1 : FTI_Inje.counter; FTI_Inje.timer = (res == FTI_SCES) ? MPI_Wtime() : FTI_Inje.timer; sprintf(str, "Injecting bit-flip in dataset %d, index %d, bit %d : %f => %f", - datasetID, FTI_Inje.index, FTI_Inje.position, ori, *target); + datasetID, FTI_Inje.index, FTI_Inje.position, ori, *target); FTI_Print(str, FTI_WARN); return res; } - if (FTI_Data[datasetID].type.id == 8) - { // If it is a float - float *target = FTI_Data[datasetID].ptr+FTI_Inje.index; + if (FTI_Data[datasetID].type.id == 8) { // If it is a float + float* target = FTI_Data[datasetID].ptr + FTI_Inje.index; float ori = *target; int res = FTI_FloatBitFlip(target, FTI_Inje.position); FTI_Inje.counter = (res == FTI_SCES) ? FTI_Inje.counter + 1 : FTI_Inje.counter; FTI_Inje.timer = (res == FTI_SCES) ? 
MPI_Wtime() : FTI_Inje.timer; sprintf(str, "Injecting bit-flip in dataset %d, index %d, bit %d : %f => %f", - datasetID, FTI_Inje.index, FTI_Inje.position, ori, *target); + datasetID, FTI_Inje.index, FTI_Inje.position, ori, *target); FTI_Print(str, FTI_WARN); return res; } @@ -298,7 +284,6 @@ int FTI_BitFlip(int datasetID) return FTI_NSCS; } - /*-------------------------------------------------------------------------*/ /** @brief It takes the checkpoint and triggers the post-ckpt. work. @@ -313,23 +298,21 @@ int FTI_BitFlip(int datasetID) **/ /*-------------------------------------------------------------------------*/ -int FTI_Checkpoint(int id, int level) { +int FTI_Checkpoint(int id, int level) +{ int i, res = FTI_NSCS; double t0, t1, t2, t3, t4; char str[FTI_BUFS]; MPI_Status status; - if ((level > 0) && (level < 5)) - { + if ((level > 0) && (level < 5)) { t0 = MPI_Wtime(); FTI_Exec.ckptID = id; FTI_Exec.ckptLvel = level; sprintf(str, "Ckpt. ID %d", FTI_Exec.ckptID); - sprintf(str, "%s (L%d) (%.2f MB/proc)", str, FTI_Exec.ckptLvel, FTI_Exec.ckptSize/(1024.0*1024.0)); - if (FTI_Exec.wasLastOffline == 1) - { // Block until previous checkpoint is done (Async. work) + sprintf(str, "%s (L%d) (%.2f MB/proc)", str, FTI_Exec.ckptLvel, FTI_Exec.ckptSize / (1024.0 * 1024.0)); + if (FTI_Exec.wasLastOffline == 1) { // Block until previous checkpoint is done (Async. work) MPI_Recv(&res, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm, &status); - if (res == FTI_SCES) - { + if (res == FTI_SCES) { FTI_Exec.lastCkptLvel = res; FTI_Exec.wasLastOffline = 1; FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; @@ -339,37 +322,38 @@ int FTI_Checkpoint(int id, int level) { res = FTI_Try(FTI_WriteCkpt(FTI_Data), "write the checkpoint."); //MPI_Allreduce(&res, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); t2 = MPI_Wtime(); - if (!FTI_Ckpt[FTI_Exec.ckptLvel].isInline) - { // If postCkpt. work is Async. then send message.. 
+ if (!FTI_Ckpt[FTI_Exec.ckptLvel].isInline) { // If postCkpt. work is Async. then send message.. FTI_Exec.wasLastOffline = 1; - if (res != FTI_SCES) - { + if (res != FTI_SCES) { res = FTI_REJW; - } else { + } + else { res = FTI_BASE + FTI_Exec.ckptLvel; } MPI_Send(&res, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm); - } else { + } + else { FTI_Exec.wasLastOffline = 0; - if (res != FTI_SCES) FTI_Exec.ckptLvel = FTI_REJW-FTI_BASE; + if (res != FTI_SCES) + FTI_Exec.ckptLvel = FTI_REJW - FTI_BASE; res = FTI_Try(FTI_PostCkpt(FTI_Topo.groupID, -1, 1), "postprocess the checkpoint."); - if (res == FTI_SCES) - { + if (res == FTI_SCES) { FTI_Exec.wasLastOffline = 0; FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; } } t3 = MPI_Wtime(); - sprintf(str, "%s taken in %.2f sec.", str, t3-t0); - sprintf(str, "%s (Wt:%.2fs, Wr:%.2fs, Ps:%.2fs)", str, t1-t0, t2-t1, t3-t2); + sprintf(str, "%s taken in %.2f sec.", str, t3 - t0); + sprintf(str, "%s (Wt:%.2fs, Wr:%.2fs, Ps:%.2fs)", str, t1 - t0, t2 - t1, t3 - t2); FTI_Print(str, FTI_INFO); - if (res == FTI_SCES) res = FTI_DONE; - else res = FTI_NSCS; + if (res == FTI_SCES) + res = FTI_DONE; + else + res = FTI_NSCS; } return res; } - /*-------------------------------------------------------------------------*/ /** @brief It loads the checkpoint data. 
@@ -380,30 +364,27 @@ int FTI_Checkpoint(int id, int level) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Recover() { +int FTI_Recover() +{ char fn[FTI_BUFS], str[FTI_BUFS]; - FILE *fd; + FILE* fd; int i; - sprintf(fn,"%s/%s" ,FTI_Ckpt[FTI_Exec.ckptLvel].dir, FTI_Exec.ckptFile); + sprintf(fn, "%s/%s", FTI_Ckpt[FTI_Exec.ckptLvel].dir, FTI_Exec.ckptFile); sprintf(str, "Trying to load FTI checkpoint file (%s)...", fn); FTI_Print(str, FTI_DBUG); - if (access(fn, F_OK) != 0) - { + if (access(fn, F_OK) != 0) { FTI_Print("FTI checkpoint file is NOT accesible.", FTI_EROR); return FTI_NSCS; } fd = fopen(fn, "rb"); - if (fd == NULL) - { + if (fd == NULL) { FTI_Print("Could not open FTI checkpoint file.", FTI_EROR); return FTI_NSCS; } - for(i = 0; i < FTI_Exec.nbVar; i++) - { + for (i = 0; i < FTI_Exec.nbVar; i++) { fread(FTI_Data[i].ptr, 1, FTI_Data[i].size, fd); } - if (fclose(fd) != 0) - { + if (fclose(fd) != 0) { FTI_Print("Could not close FTI checkpoint file.", FTI_EROR); return FTI_NSCS; } @@ -411,7 +392,6 @@ int FTI_Recover() { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Takes an FTI snapshot or recover the data if it is a restart. 
@@ -425,36 +405,32 @@ int FTI_Recover() { **/ /*-------------------------------------------------------------------------*/ -int FTI_Snapshot() { +int FTI_Snapshot() +{ int i, res, level = -1; - if (FTI_Exec.reco) - { // If this is a recovery load icheckpoint data + if (FTI_Exec.reco) { // If this is a recovery load icheckpoint data res = FTI_Try(FTI_Recover(), "recover the checkpointed data."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { FTI_Print("Impossible to load the checkpoint data.", FTI_EROR); FTI_Clean(5, FTI_Topo.groupID, FTI_Topo.myRank); MPI_Abort(MPI_COMM_WORLD, -1); MPI_Finalize(); exit(1); } - } else { // If it is a checkpoint test + } + else { // If it is a checkpoint test res = FTI_SCES; FTI_UpdateIterTime(); - if (FTI_Exec.ckptNext == FTI_Exec.ckptIcnt) - { // If it is time to check for possible ckpt. (every minute) + if (FTI_Exec.ckptNext == FTI_Exec.ckptIcnt) { // If it is time to check for possible ckpt. (every minute) FTI_Print("Checking if it is time to checkpoint.", FTI_DBUG); FTI_Exec.ckptCnt++; // Increment minute counter - for (i = 1; i < 5; i++) - { // Check ckpt. level - if (FTI_Exec.ckptCnt % FTI_Ckpt[i].ckptIntv == 0) - { + for (i = 1; i < 5; i++) { // Check ckpt. level + if (FTI_Exec.ckptCnt % FTI_Ckpt[i].ckptIntv == 0) { level = i; } } - if (level != -1) - { - res = FTI_Try(FTI_Checkpoint(FTI_Exec.ckptCnt, level), "take checkpoint."); + if (level != -1) { + res = FTI_Try(FTI_Checkpoint(FTI_Exec.ckptCnt, level), "take checkpoint."); } FTI_Exec.ckptLast = FTI_Exec.ckptNext; FTI_Exec.ckptNext = FTI_Exec.ckptNext + FTI_Exec.ckptIntv; @@ -464,7 +440,6 @@ int FTI_Snapshot() { return res; } - /*-------------------------------------------------------------------------*/ /** @brief It closes FTI properly on the application processes. 
@@ -476,50 +451,43 @@ int FTI_Snapshot() { **/ /*-------------------------------------------------------------------------*/ -int FTI_Finalize() { - if (!FTI_Topo.amIaHead) - { +int FTI_Finalize() +{ + if (!FTI_Topo.amIaHead) { int buff = FTI_ENDW; MPI_Status status; - if (FTI_Exec.wasLastOffline == 1) - { // If there is remaining work to do for last checkpoint + if (FTI_Exec.wasLastOffline == 1) { // If there is remaining work to do for last checkpoint MPI_Recv(&buff, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm, &status); - if (buff != FTI_NSCS) - { + if (buff != FTI_NSCS) { FTI_Exec.ckptLvel = buff; FTI_Exec.wasLastOffline = 1; FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; } } buff = FTI_ENDW; - if (FTI_Topo.nbHeads == 1) - { // Send notice to the head to stop listening + if (FTI_Topo.nbHeads == 1) { // Send notice to the head to stop listening MPI_Send(&buff, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm); } - if (FTI_Conf.saveLastCkpt) - { // If we need to keep the last checkpoint - if (FTI_Exec.lastCkptLvel != 4) - { + if (FTI_Conf.saveLastCkpt) { // If we need to keep the last checkpoint + if (FTI_Exec.lastCkptLvel != 4) { FTI_Try(FTI_Flush(FTI_Topo.groupID, FTI_Exec.lastCkptLvel), "save the last ckpt. 
in the PFS."); MPI_Barrier(FTI_COMM_WORLD); - if (FTI_Topo.splitRank == 0) - { - if (access(FTI_Ckpt[4].dir,0)==0) + if (FTI_Topo.splitRank == 0) { + if (access(FTI_Ckpt[4].dir, 0) == 0) FTI_RmDir(FTI_Ckpt[4].dir, 1); - if (access(FTI_Ckpt[4].metaDir,0)==0) + if (access(FTI_Ckpt[4].metaDir, 0) == 0) FTI_RmDir(FTI_Ckpt[4].metaDir, 1); rename(FTI_Ckpt[FTI_Exec.lastCkptLvel].metaDir, FTI_Ckpt[4].metaDir); rename(FTI_Conf.gTmpDir, FTI_Ckpt[4].dir); } } - if (FTI_Topo.splitRank == 0) - { + if (FTI_Topo.splitRank == 0) { FTI_Try(FTI_UpdateConf(2), "update configuration file to 2."); } buff = 6; // For cleaning only local storage - } else { - if (FTI_Topo.splitRank == 0) - { + } + else { + if (FTI_Topo.splitRank == 0) { FTI_Try(FTI_UpdateConf(0), "update configuration file to 0."); } buff = 5; // For cleaning everything @@ -527,12 +495,11 @@ int FTI_Finalize() { MPI_Barrier(FTI_Exec.globalComm); FTI_Try(FTI_Clean(buff, FTI_Topo.groupID, FTI_Topo.myRank), "do final clean."); FTI_Print("FTI has been finalized.", FTI_INFO); - } else { + } + else { MPI_Barrier(FTI_Exec.globalComm); MPI_Finalize(); exit(0); } return FTI_SCES; } - - diff --git a/src/checkpoint.c b/src/checkpoint.c index 38dc87f3d..6edd15976 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -5,10 +5,8 @@ * @brief Checkpointing functions for the FTI library. */ - #include "fti.h" - /*-------------------------------------------------------------------------*/ /** @brief It updates the local and global mean iteration time. 
@@ -20,37 +18,34 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_UpdateIterTime() { +int FTI_UpdateIterTime() +{ int nbProcs, res; char str[FTI_BUFS]; double last = FTI_Exec.iterTime; FTI_Exec.iterTime = MPI_Wtime(); - if (FTI_Exec.ckptIcnt > 0) - { + if (FTI_Exec.ckptIcnt > 0) { FTI_Exec.lastIterTime = FTI_Exec.iterTime - last; FTI_Exec.totalIterTime = FTI_Exec.totalIterTime + FTI_Exec.lastIterTime; - if (FTI_Exec.ckptIcnt % FTI_Exec.syncIter == 0) - { + if (FTI_Exec.ckptIcnt % FTI_Exec.syncIter == 0) { FTI_Exec.meanIterTime = FTI_Exec.totalIterTime / FTI_Exec.ckptIcnt; MPI_Allreduce(&FTI_Exec.meanIterTime, &FTI_Exec.globMeanIter, 1, MPI_DOUBLE, MPI_SUM, FTI_COMM_WORLD); MPI_Comm_size(FTI_COMM_WORLD, &nbProcs); - FTI_Exec.globMeanIter = FTI_Exec.globMeanIter/nbProcs; - if (FTI_Exec.globMeanIter > 60) - { + FTI_Exec.globMeanIter = FTI_Exec.globMeanIter / nbProcs; + if (FTI_Exec.globMeanIter > 60) { FTI_Exec.ckptIntv = 1; - } else { - FTI_Exec.ckptIntv = (1*60)/FTI_Exec.globMeanIter; + } + else { + FTI_Exec.ckptIntv = (1 * 60) / FTI_Exec.globMeanIter; } res = FTI_Exec.ckptLast + FTI_Exec.ckptIntv; - if (res >= FTI_Exec.ckptIcnt) - { + if (res >= FTI_Exec.ckptIcnt) { FTI_Exec.ckptNext = res; } - if (FTI_Exec.syncIter < (FTI_Exec.ckptIntv/2)) - { + if (FTI_Exec.syncIter < (FTI_Exec.ckptIntv / 2)) { FTI_Exec.syncIter = FTI_Exec.syncIter * 2; sprintf(str, "Iteration frequency : %.2f sec/iter => %d iter/min. Resync every %d iter.", - FTI_Exec.globMeanIter, FTI_Exec.ckptIntv, FTI_Exec.syncIter); + FTI_Exec.globMeanIter, FTI_Exec.ckptIntv, FTI_Exec.syncIter); FTI_Print(str, FTI_DBUG); } } @@ -59,7 +54,6 @@ int FTI_UpdateIterTime() { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It writes the checkpoint data in the target file. 
@@ -72,53 +66,48 @@ int FTI_UpdateIterTime() { **/ /*-------------------------------------------------------------------------*/ -int FTI_WriteCkpt(FTIT_dataset* FTI_Data) { +int FTI_WriteCkpt(FTIT_dataset* FTI_Data) +{ int i, res; - FILE *fd; + FILE* fd; double tt = MPI_Wtime(); char fn[FTI_BUFS], str[FTI_BUFS]; snprintf(FTI_Exec.ckptFile, FTI_BUFS, "Ckpt%d-Rank%d.fti", FTI_Exec.ckptID, FTI_Topo.myRank); - if (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) - { - sprintf(fn,"%s/%s",FTI_Conf.gTmpDir, FTI_Exec.ckptFile); + if (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) { + sprintf(fn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); mkdir(FTI_Conf.gTmpDir, 0777); - } else { - sprintf(fn,"%s/%s",FTI_Conf.lTmpDir, FTI_Exec.ckptFile); + } + else { + sprintf(fn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); mkdir(FTI_Conf.lTmpDir, 0777); } fd = fopen(fn, "wb"); - if (fd == NULL) - { + if (fd == NULL) { FTI_Print("FTI checkpoint file could not be opened.", FTI_EROR); return FTI_NSCS; } - for(i = 0; i < FTI_Exec.nbVar; i++) - { - if (fwrite(FTI_Data[i].ptr, FTI_Data[i].eleSize, FTI_Data[i].count, fd) != FTI_Data[i].count) - { + for (i = 0; i < FTI_Exec.nbVar; i++) { + if (fwrite(FTI_Data[i].ptr, FTI_Data[i].eleSize, FTI_Data[i].count, fd) != FTI_Data[i].count) { sprintf(str, "Dataset #%d could not be written.", FTI_Data[i].id); FTI_Print(str, FTI_EROR); return FTI_NSCS; } } - if (fflush(fd) != 0) - { + if (fflush(fd) != 0) { FTI_Print("FTI checkpoint file could not be flushed.", FTI_EROR); return FTI_NSCS; } - if (fclose(fd) != 0) - { + if (fclose(fd) != 0) { FTI_Print("FTI checkpoint file could not be flushed.", FTI_EROR); return FTI_NSCS; } - sprintf(str, "Time writing checkpoint file : %f seconds.", MPI_Wtime()-tt); + sprintf(str, "Time writing checkpoint file : %f seconds.", MPI_Wtime() - tt); FTI_Print(str, FTI_DBUG); int globalTmp = (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) ? 
1 : 0; res = FTI_Try(FTI_CreateMetadata(globalTmp), "create metadata."); return res; } - /*-------------------------------------------------------------------------*/ /** @brief Decides wich action start depending on the ckpt. level. @@ -135,22 +124,21 @@ int FTI_WriteCkpt(FTIT_dataset* FTI_Data) { **/ /*-------------------------------------------------------------------------*/ -int FTI_GroupClean(int level, int group, int pr) { +int FTI_GroupClean(int level, int group, int pr) +{ int i, rank; - if (level == 0) - { + if (level == 0) { FTI_Print("Error postprocessing checkpoint. Discarding checkpoint...", FTI_WARN); } rank = FTI_Topo.myRank; - for(i = 0; i < pr; i++) - { - if (FTI_Topo.amIaHead) rank = FTI_Topo.body[i]; - FTI_Clean(level, i+group, rank); + for (i = 0; i < pr; i++) { + if (FTI_Topo.amIaHead) + rank = FTI_Topo.body[i]; + FTI_Clean(level, i + group, rank); } return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Decides wich action start depending on the ckpt. level. @@ -167,60 +155,63 @@ int FTI_GroupClean(int level, int group, int pr) { **/ /*-------------------------------------------------------------------------*/ -int FTI_PostCkpt(int group, int fo, int pr) { +int FTI_PostCkpt(int group, int fo, int pr) +{ int i, tres, res, level, nodeFlag, globalFlag = FTI_Topo.splitRank; double t0, t1, t2, t3; char str[FTI_BUFS]; t0 = MPI_Wtime(); - res = (FTI_Exec.ckptLvel == (FTI_REJW-FTI_BASE)) ? FTI_NSCS : FTI_SCES; + res = (FTI_Exec.ckptLvel == (FTI_REJW - FTI_BASE)) ? 
FTI_NSCS : FTI_SCES; MPI_Allreduce(&res, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); - if (tres != FTI_SCES) - { + if (tres != FTI_SCES) { FTI_GroupClean(0, group, pr); return FTI_NSCS; } t1 = MPI_Wtime(); - for(i = 0; i < pr; i++) { - switch(FTI_Exec.ckptLvel) { - case 4 : res += FTI_Flush(i+group, fo); break; - case 3 : res += FTI_RSenc(i+group); break; - case 2 : res += FTI_Ptner(i+group); break; - case 1 : res += FTI_Local(i+group); break; + for (i = 0; i < pr; i++) { + switch (FTI_Exec.ckptLvel) { + case 4: + res += FTI_Flush(i + group, fo); + break; + case 3: + res += FTI_RSenc(i + group); + break; + case 2: + res += FTI_Ptner(i + group); + break; + case 1: + res += FTI_Local(i + group); + break; } } MPI_Allreduce(&res, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); - if (tres != FTI_SCES) - { + if (tres != FTI_SCES) { FTI_GroupClean(0, group, pr); return FTI_NSCS; } t2 = MPI_Wtime(); FTI_GroupClean(FTI_Exec.ckptLvel, group, pr); MPI_Barrier(FTI_COMM_WORLD); - nodeFlag = (((!FTI_Topo.amIaHead) && (FTI_Topo.nodeRank == 0)) || (FTI_Topo.amIaHead))? 1 : 0; - if (nodeFlag) - { + nodeFlag = (((!FTI_Topo.amIaHead) && (FTI_Topo.nodeRank == 0)) || (FTI_Topo.amIaHead)) ? 1 : 0; + if (nodeFlag) { level = (FTI_Exec.ckptLvel != 4) ? 
FTI_Exec.ckptLvel : 1; rename(FTI_Conf.lTmpDir, FTI_Ckpt[level].dir); FTI_Print("Local directory renamed", FTI_DBUG); } - if (!globalFlag) - { - if (FTI_Exec.ckptLvel == 4) - { + if (!globalFlag) { + if (FTI_Exec.ckptLvel == 4) { rename(FTI_Conf.gTmpDir, FTI_Ckpt[FTI_Exec.ckptLvel].dir); } rename(FTI_Conf.mTmpDir, FTI_Ckpt[FTI_Exec.ckptLvel].metaDir); } t3 = MPI_Wtime(); - sprintf(str, "Post-checkpoint took %.2f sec.", t3-t0); - sprintf(str, "%s (Ag:%.2fs, Pt:%.2fs, Cl:%.2fs)", str, t1-t0, t2-t1, t3-t2); + sprintf(str, "Post-checkpoint took %.2f sec.", t3 - t0); + sprintf(str, "%s (Ag:%.2fs, Pt:%.2fs, Cl:%.2fs)", str, t1 - t0, t2 - t1, t3 - t2); FTI_Print(str, FTI_INFO); return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It listens for checkpoint notifications. @@ -233,50 +224,41 @@ int FTI_PostCkpt(int group, int fo, int pr) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Listen() { +int FTI_Listen() +{ MPI_Status status; char str[FTI_BUFS]; int i, buf, res, flags[7]; - for (i = 0; i < 7; i++) - { // Initialize flags + for (i = 0; i < 7; i++) { // Initialize flags flags[i] = 0; } FTI_Print("Head listening...", FTI_DBUG); - for(i = 0; i < FTI_Topo.nbApprocs; i++) - { // Iterate on the application processes in the node + for (i = 0; i < FTI_Topo.nbApprocs; i++) { // Iterate on the application processes in the node MPI_Recv(&buf, 1, MPI_INT, FTI_Topo.body[i], FTI_Conf.tag, FTI_Exec.globalComm, &status); sprintf(str, "The head received a %d message", buf); FTI_Print(str, FTI_DBUG); fflush(stdout); - flags[buf-FTI_BASE] = flags[buf-FTI_BASE] + 1; + flags[buf - FTI_BASE] = flags[buf - FTI_BASE] + 1; } - for (i = 1; i < 7; i++) - { - if (flags[i] == FTI_Topo.nbApprocs) - { // Determining checkpoint level + for (i = 1; i < 7; i++) { + if (flags[i] == FTI_Topo.nbApprocs) { // Determining checkpoint level FTI_Exec.ckptLvel = i; } } - if (flags[6] > 0) - { + if (flags[6] > 
0) { FTI_Exec.ckptLvel = 6; } - if (FTI_Exec.ckptLvel == 5) - { // If we were asked to finalize + if (FTI_Exec.ckptLvel == 5) { // If we were asked to finalize return FTI_ENDW; } res = FTI_Try(FTI_PostCkpt(1, 0, FTI_Topo.nbApprocs), "postprocess the checkpoint."); - if (res == FTI_SCES) - { + if (res == FTI_SCES) { FTI_Exec.wasLastOffline = 1; FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; res = FTI_Exec.ckptLvel; } - for(i = 0; i < FTI_Topo.nbApprocs; i++) - { // Send msg. to avoid checkpoint collision + for (i = 0; i < FTI_Topo.nbApprocs; i++) { // Send msg. to avoid checkpoint collision MPI_Send(&res, 1, MPI_INT, FTI_Topo.body[i], FTI_Conf.tag, FTI_Exec.globalComm); } return FTI_SCES; } - - diff --git a/src/conf.c b/src/conf.c index 8bf4cb711..dcccd3a58 100644 --- a/src/conf.c +++ b/src/conf.c @@ -5,10 +5,8 @@ * @brief Configuration loading functions for the FTI library. */ - #include "fti.h" - /*-------------------------------------------------------------------------*/ /** @brief Set the exec. ID and failure parameters in the conf. file. @@ -24,34 +22,31 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_UpdateConf(int restart) { +int FTI_UpdateConf(int restart) +{ char str[FTI_BUFS]; - dictionary *ini; + dictionary* ini; ini = iniparser_load(FTI_Conf.cfgFile); // Load dictionary sprintf(str, "Updating configuration file (%s)...", FTI_Conf.cfgFile); FTI_Print(str, FTI_DBUG); - if (ini == NULL) - { + if (ini == NULL) { FTI_Print("Iniparser failed to parse the conf. file.", FTI_WARN); return FTI_NSCS; } sprintf(str, "%d", restart); iniparser_set(ini, "Restart:failure", str); // Set failure to 'restart' iniparser_set(ini, "Restart:exec_id", FTI_Exec.id); // Set the exec. 
ID - FILE *fd = fopen(FTI_Conf.cfgFile, "w"); - if (fd == NULL) - { + FILE* fd = fopen(FTI_Conf.cfgFile, "w"); + if (fd == NULL) { FTI_Print("FTI failed to open the configuration file.", FTI_EROR); return FTI_NSCS; } iniparser_dump_ini(ini, fd); // Write new configuration - if (fflush(fd) != 0) - { + if (fflush(fd) != 0) { FTI_Print("FTI failed to flush the configuration file.", FTI_EROR); return FTI_NSCS; } - if (fclose(fd) != 0) - { + if (fclose(fd) != 0) { FTI_Print("FTI failed to close the configuration file.", FTI_EROR); return FTI_NSCS; } @@ -59,7 +54,6 @@ int FTI_UpdateConf(int restart) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It reads the configuration given in the configuration file. @@ -70,20 +64,19 @@ int FTI_UpdateConf(int restart) { **/ /*-------------------------------------------------------------------------*/ -int FTI_ReadConf(FTIT_injection *FTI_Inje) { +int FTI_ReadConf(FTIT_injection* FTI_Inje) +{ // Check access to FTI configuration file and load dictionary - dictionary *ini; + dictionary* ini; char *par, str[FTI_BUFS]; sprintf(str, "Reading FTI configuration file (%s)...", FTI_Conf.cfgFile); FTI_Print(str, FTI_INFO); - if (access(FTI_Conf.cfgFile, F_OK) != 0) - { + if (access(FTI_Conf.cfgFile, F_OK) != 0) { FTI_Print("FTI configuration file NOT accessible.", FTI_WARN); return FTI_NSCS; } ini = iniparser_load(FTI_Conf.cfgFile); - if (ini == NULL) - { + if (ini == NULL) { FTI_Print("Iniparser failed to parse the conf. 
file.", FTI_WARN); return FTI_NSCS; } @@ -95,21 +88,21 @@ int FTI_ReadConf(FTIT_injection *FTI_Inje) { snprintf(FTI_Conf.glbalDir, FTI_BUFS, "%s", par); par = iniparser_getstring(ini, "Basic:meta_dir", NULL); snprintf(FTI_Conf.metadDir, FTI_BUFS, "%s", par); - FTI_Ckpt[1].ckptIntv = (int) iniparser_getint(ini, "Basic:ckpt_l1", -1); - FTI_Ckpt[2].ckptIntv = (int) iniparser_getint(ini, "Basic:ckpt_l2", -1); - FTI_Ckpt[3].ckptIntv = (int) iniparser_getint(ini, "Basic:ckpt_l3", -1); - FTI_Ckpt[4].ckptIntv = (int) iniparser_getint(ini, "Basic:ckpt_l4", -1); - FTI_Ckpt[1].isInline = (int) 1; - FTI_Ckpt[2].isInline = (int) iniparser_getint(ini, "Basic:inline_l2", 1); - FTI_Ckpt[3].isInline = (int) iniparser_getint(ini, "Basic:inline_l3", 1); - FTI_Ckpt[4].isInline = (int) iniparser_getint(ini, "Basic:inline_l4", 1); + FTI_Ckpt[1].ckptIntv = (int)iniparser_getint(ini, "Basic:ckpt_l1", -1); + FTI_Ckpt[2].ckptIntv = (int)iniparser_getint(ini, "Basic:ckpt_l2", -1); + FTI_Ckpt[3].ckptIntv = (int)iniparser_getint(ini, "Basic:ckpt_l3", -1); + FTI_Ckpt[4].ckptIntv = (int)iniparser_getint(ini, "Basic:ckpt_l4", -1); + FTI_Ckpt[1].isInline = (int)1; + FTI_Ckpt[2].isInline = (int)iniparser_getint(ini, "Basic:inline_l2", 1); + FTI_Ckpt[3].isInline = (int)iniparser_getint(ini, "Basic:inline_l3", 1); + FTI_Ckpt[4].isInline = (int)iniparser_getint(ini, "Basic:inline_l4", 1); // Reading/setting configuration metadata - FTI_Conf.verbosity = (int) iniparser_getint(ini, "Basic:verbosity", -1); - FTI_Conf.saveLastCkpt = (int) iniparser_getint(ini, "Basic:keep_last_ckpt", 0); - FTI_Conf.blockSize = (int) iniparser_getint(ini, "Advanced:block_size", -1) * 1024; - FTI_Conf.tag = (int) iniparser_getint(ini, "Advanced:mpi_tag", -1); - FTI_Conf.test = (int) iniparser_getint(ini, "Advanced:local_test", -1); + FTI_Conf.verbosity = (int)iniparser_getint(ini, "Basic:verbosity", -1); + FTI_Conf.saveLastCkpt = (int)iniparser_getint(ini, "Basic:keep_last_ckpt", 0); + FTI_Conf.blockSize = 
(int)iniparser_getint(ini, "Advanced:block_size", -1) * 1024; + FTI_Conf.tag = (int)iniparser_getint(ini, "Advanced:mpi_tag", -1); + FTI_Conf.test = (int)iniparser_getint(ini, "Advanced:local_test", -1); FTI_Conf.l3WordSize = FTI_WORD; // Reading/setting execution metadata @@ -128,17 +121,17 @@ int FTI_ReadConf(FTIT_injection *FTI_Inje) { FTI_Exec.lastIterTime = 0; FTI_Exec.totalIterTime = 0; FTI_Exec.meanIterTime = 0; - FTI_Exec.reco = (int) iniparser_getint(ini, "restart:failure", 0); - if (FTI_Exec.reco == 0) - { + FTI_Exec.reco = (int)iniparser_getint(ini, "restart:failure", 0); + if (FTI_Exec.reco == 0) { time_t tim = time(NULL); - struct tm *n = localtime(&tim); + struct tm* n = localtime(&tim); snprintf(FTI_Exec.id, FTI_BUFS, "%d-%02d-%02d_%02d-%02d-%02d", - n->tm_year+1900, n->tm_mon+1, n->tm_mday, n->tm_hour, n->tm_min, n->tm_sec); + n->tm_year + 1900, n->tm_mon + 1, n->tm_mday, n->tm_hour, n->tm_min, n->tm_sec); MPI_Bcast(FTI_Exec.id, FTI_BUFS, MPI_CHAR, 0, FTI_Exec.globalComm); sprintf(str, "The execution ID is: %s", FTI_Exec.id); FTI_Print(str, FTI_INFO); - } else { + } + else { par = iniparser_getstring(ini, "restart:exec_id", NULL); snprintf(FTI_Exec.id, FTI_BUFS, "%s", par); sprintf(str, "This is a restart. 
The execution ID is: %s", FTI_Exec.id); @@ -146,18 +139,18 @@ int FTI_ReadConf(FTIT_injection *FTI_Inje) { } // Reading/setting topology metadata - FTI_Topo.nbHeads = (int) iniparser_getint(ini, "Basic:head", 0); - FTI_Topo.groupSize = (int) iniparser_getint(ini, "Basic:group_size", -1); - FTI_Topo.nodeSize = (int) iniparser_getint(ini, "Basic:node_size", -1); + FTI_Topo.nbHeads = (int)iniparser_getint(ini, "Basic:head", 0); + FTI_Topo.groupSize = (int)iniparser_getint(ini, "Basic:group_size", -1); + FTI_Topo.nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); FTI_Topo.nbApprocs = FTI_Topo.nodeSize - FTI_Topo.nbHeads; FTI_Topo.nbNodes = FTI_Topo.nbProc / FTI_Topo.nodeSize; // Reading/setting injection parameters - FTI_Inje->rank = (int) iniparser_getint(ini, "Injection:rank", 0); - FTI_Inje->index = (int) iniparser_getint(ini, "Injection:index", 0); - FTI_Inje->position = (int) iniparser_getint(ini, "Injection:position", 0); - FTI_Inje->number = (int) iniparser_getint(ini, "Injection:number", 0); - FTI_Inje->frequency = (int) iniparser_getint(ini, "Injection:frequency", -1); + FTI_Inje->rank = (int)iniparser_getint(ini, "Injection:rank", 0); + FTI_Inje->index = (int)iniparser_getint(ini, "Injection:index", 0); + FTI_Inje->position = (int)iniparser_getint(ini, "Injection:position", 0); + FTI_Inje->number = (int)iniparser_getint(ini, "Injection:number", 0); + FTI_Inje->frequency = (int)iniparser_getint(ini, "Injection:frequency", -1); // Synchronize after config reading and free dictionary MPI_Barrier(FTI_Exec.globalComm); @@ -165,7 +158,6 @@ int FTI_ReadConf(FTIT_injection *FTI_Inje) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It tests that the configuration given is correct. 
@@ -176,59 +168,51 @@ int FTI_ReadConf(FTIT_injection *FTI_Inje) { **/ /*-------------------------------------------------------------------------*/ -int FTI_TestConfig() { - if (FTI_Topo.nbHeads != 0 && FTI_Topo.nbHeads != 1) - { +int FTI_TestConfig() +{ + if (FTI_Topo.nbHeads != 0 && FTI_Topo.nbHeads != 1) { FTI_Print("The number of heads needs to be set to 0 or 1.", FTI_WARN); return FTI_NSCS; } - if (FTI_Topo.nbProc % FTI_Topo.nodeSize != 0) - { + if (FTI_Topo.nbProc % FTI_Topo.nodeSize != 0) { FTI_Print("Number of ranks is not a multiple of the node size.", FTI_WARN); return FTI_NSCS; } - if (FTI_Topo.nbNodes % FTI_Topo.groupSize != 0) - { + if (FTI_Topo.nbNodes % FTI_Topo.groupSize != 0) { FTI_Print("The group size is not multiple of the number of nodes.", FTI_WARN); return FTI_NSCS; } - if (FTI_Topo.groupSize <= 2) - { + if (FTI_Topo.groupSize <= 2) { FTI_Print("The group size must be bigger than 2", FTI_WARN); return FTI_NSCS; } - if (FTI_Topo.groupSize >= 32) - { + if (FTI_Topo.groupSize >= 32) { FTI_Print("The group size must be lower than 32", FTI_WARN); return FTI_NSCS; } - if (FTI_Conf.verbosity > 3 || FTI_Conf.verbosity < 1) - { + if (FTI_Conf.verbosity > 3 || FTI_Conf.verbosity < 1) { FTI_Print("Verbosity needs to be set to 1, 2 or 3.", FTI_WARN); return FTI_NSCS; } - if (FTI_Conf.blockSize > (2048*1024) || FTI_Conf.blockSize < (1*1024)) - { + if (FTI_Conf.blockSize > (2048 * 1024) || FTI_Conf.blockSize < (1 * 1024)) { FTI_Print("Block size needs to be set between 1 and 2048.", FTI_WARN); return FTI_NSCS; } - if (FTI_Conf.test != 0 && FTI_Conf.test != 1) - { + if (FTI_Conf.test != 0 && FTI_Conf.test != 1) { FTI_Print("Local test size needs to be set to 0 or 1.", FTI_WARN); return FTI_NSCS; } - if (FTI_Conf.saveLastCkpt != 0 && FTI_Conf.saveLastCkpt != 1) - { + if (FTI_Conf.saveLastCkpt != 0 && FTI_Conf.saveLastCkpt != 1) { FTI_Print("Keep last ckpt. 
needs to be set to 0 or 1.", FTI_WARN); return FTI_NSCS; } int l; - for (l = 1; l < 5; l++) - { - if (FTI_Ckpt[l].ckptIntv == 0) FTI_Ckpt[l].ckptIntv = -1; - if (FTI_Ckpt[l].isInline != 0 && FTI_Ckpt[l].isInline != 1) FTI_Ckpt[l].isInline = 1; - if (FTI_Ckpt[l].isInline == 0 && FTI_Topo.nbHeads != 1) - { + for (l = 1; l < 5; l++) { + if (FTI_Ckpt[l].ckptIntv == 0) + FTI_Ckpt[l].ckptIntv = -1; + if (FTI_Ckpt[l].isInline != 0 && FTI_Ckpt[l].isInline != 1) + FTI_Ckpt[l].isInline = 1; + if (FTI_Ckpt[l].isInline == 0 && FTI_Topo.nbHeads != 1) { FTI_Print("If inline is set to 0 then head should be set to 1.", FTI_WARN); return FTI_NSCS; } @@ -236,7 +220,6 @@ int FTI_TestConfig() { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It tests that the directories given is correct. @@ -247,45 +230,39 @@ int FTI_TestConfig() { **/ /*-------------------------------------------------------------------------*/ -int FTI_TestDirectories() { +int FTI_TestDirectories() +{ char str[FTI_BUFS]; // Checking local directory - if (access(FTI_Conf.localDir, W_OK) != 0) - { - sprintf(str,"Checking the local directory (%s)...", FTI_Conf.localDir); + if (access(FTI_Conf.localDir, W_OK) != 0) { + sprintf(str, "Checking the local directory (%s)...", FTI_Conf.localDir); FTI_Print(str, FTI_DBUG); FTI_Print("The local directory does not exist or has no write access.", FTI_DBUG); - if (mkdir(FTI_Conf.localDir, 0777) != 0) - { + if (mkdir(FTI_Conf.localDir, 0777) != 0) { FTI_Print("The local directory could NOT be created.", FTI_WARN); return FTI_NSCS; } } - if (FTI_Topo.myRank == 0) - { + if (FTI_Topo.myRank == 0) { // Checking metadata directory - sprintf(str,"Checking the metadata directory (%s)...", FTI_Conf.metadDir); + sprintf(str, "Checking the metadata directory (%s)...", FTI_Conf.metadDir); FTI_Print(str, FTI_DBUG); - if (access(FTI_Conf.metadDir, W_OK) != 0) - { + if (access(FTI_Conf.metadDir, W_OK) != 0) { FTI_Print("The metadata 
directory does not exist or has no write access.", FTI_DBUG); - if (mkdir(FTI_Conf.metadDir, 0777) != 0) - { + if (mkdir(FTI_Conf.metadDir, 0777) != 0) { FTI_Print("The metadata directory could NOT be created.", FTI_WARN); return FTI_NSCS; } } // Checking global directory - sprintf(str,"Checking the global directory (%s)...", FTI_Conf.glbalDir); + sprintf(str, "Checking the global directory (%s)...", FTI_Conf.glbalDir); FTI_Print(str, FTI_DBUG); - if (access(FTI_Conf.glbalDir, W_OK) != 0) - { + if (access(FTI_Conf.glbalDir, W_OK) != 0) { FTI_Print("The global directory does not exist or has no write access.", FTI_DBUG); - if (mkdir(FTI_Conf.glbalDir, 0777) != 0) - { + if (mkdir(FTI_Conf.glbalDir, 0777) != 0) { FTI_Print("The global directory could NOT be created.", FTI_WARN); return FTI_NSCS; } @@ -295,7 +272,6 @@ int FTI_TestDirectories() { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It creates the directories required for current execution. 
@@ -306,13 +282,13 @@ int FTI_TestDirectories() { **/ /*-------------------------------------------------------------------------*/ -int FTI_CreateDirs() { +int FTI_CreateDirs() +{ char fn[FTI_BUFS]; // Create metadata timestamp directory snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf.metadDir, FTI_Exec.id); - if (access(fn, F_OK) != 0) - { + if (access(fn, F_OK) != 0) { mkdir(fn, 0777); } snprintf(FTI_Conf.metadDir, FTI_BUFS, "%s", fn); @@ -325,27 +301,24 @@ int FTI_CreateDirs() { // Create global checkpoint timestamp directory snprintf(fn, FTI_BUFS, "%s", FTI_Conf.glbalDir); snprintf(FTI_Conf.glbalDir, FTI_BUFS, "%s/%s", fn, FTI_Exec.id); - if (access(FTI_Conf.glbalDir, F_OK) != 0) - { + if (access(FTI_Conf.glbalDir, F_OK) != 0) { mkdir(FTI_Conf.glbalDir, 0777); } snprintf(FTI_Conf.gTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf.glbalDir); snprintf(FTI_Ckpt[4].dir, FTI_BUFS, "%s/l4", FTI_Conf.glbalDir); // Create local checkpoint timestamp directory - if (FTI_Conf.test) - { // If local test generate name by topology - snprintf(fn, FTI_BUFS, "%s/node%d", FTI_Conf.localDir, FTI_Topo.myRank/FTI_Topo.nodeSize); - if (access(fn, F_OK) != 0) - { + if (FTI_Conf.test) { // If local test generate name by topology + snprintf(fn, FTI_BUFS, "%s/node%d", FTI_Conf.localDir, FTI_Topo.myRank / FTI_Topo.nodeSize); + if (access(fn, F_OK) != 0) { mkdir(fn, 0777); } - } else { + } + else { snprintf(fn, FTI_BUFS, "%s", FTI_Conf.localDir); } snprintf(FTI_Conf.localDir, FTI_BUFS, "%s/%s", fn, FTI_Exec.id); - if (access(FTI_Conf.localDir, F_OK) != 0) - { + if (access(FTI_Conf.localDir, F_OK) != 0) { mkdir(FTI_Conf.localDir, 0777); } snprintf(FTI_Conf.lTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf.localDir); @@ -355,7 +328,6 @@ int FTI_CreateDirs() { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It reads and tests the configuration given. 
@@ -366,34 +338,29 @@ int FTI_CreateDirs() { **/ /*-------------------------------------------------------------------------*/ -int FTI_LoadConf(FTIT_injection *FTI_Inje) { +int FTI_LoadConf(FTIT_injection* FTI_Inje) +{ int res; res = FTI_Try(FTI_ReadConf(FTI_Inje), "read configuration."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { FTI_Print("Impossible to read configuration.", FTI_WARN); return FTI_NSCS; } res = FTI_Try(FTI_TestConfig(), "pass the configuration test."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { FTI_Print("Wrong configuration.", FTI_WARN); return FTI_NSCS; } res = FTI_Try(FTI_TestDirectories(), "pass the directories test."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { FTI_Print("Problem with the directories.", FTI_WARN); return FTI_NSCS; } res = FTI_Try(FTI_CreateDirs(), "create checkpoint directories."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { FTI_Print("Problem creating the directories.", FTI_WARN); return FTI_NSCS; } return FTI_SCES; } - - diff --git a/src/ftif.c b/src/ftif.c index 307a66c2a..c9b4ec3c1 100644 --- a/src/ftif.c +++ b/src/ftif.c @@ -20,12 +20,11 @@ */ int FTI_Init_fort_wrapper(char* configFile, int* globalComm) { - int ierr = FTI_Init(configFile, MPI_Comm_f2c(*globalComm)); - *globalComm = MPI_Comm_c2f(FTI_COMM_WORLD); - return ierr; + int ierr = FTI_Init(configFile, MPI_Comm_f2c(*globalComm)); + *globalComm = MPI_Comm_c2f(FTI_COMM_WORLD); + return ierr; } - /** * @brief Initializes a data type. * @param type The data type to be intialized. @@ -36,7 +35,8 @@ int FTI_Init_fort_wrapper(char* configFile, int* globalComm) * size of the data type, the rest is black box for FTI. 
* **/ -int FTI_InitType_wrapper(FTIT_type **type, int size) { +int FTI_InitType_wrapper(FTIT_type** type, int size) +{ *type = talloc(FTIT_type, 1); return FTI_InitType(*type, size); } @@ -56,6 +56,7 @@ int FTI_InitType_wrapper(FTIT_type **type, int size) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Protect_wrapper(int id, void *ptr, long count, FTIT_type* type) { +int FTI_Protect_wrapper(int id, void* ptr, long count, FTIT_type* type) +{ return FTI_Protect(id, ptr, count, *type); } \ No newline at end of file diff --git a/src/meta.c b/src/meta.c index 3eed6120d..d618e4260 100644 --- a/src/meta.c +++ b/src/meta.c @@ -5,10 +5,8 @@ * @brief Metadata functions for the FTI library. */ - #include "fti.h" - /*-------------------------------------------------------------------------*/ /** @brief It gets the metadata to recover the data after a failure. @@ -24,32 +22,30 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_GetMeta(unsigned long *fs, unsigned long *mfs, int group, int level) { - dictionary *ini; +int FTI_GetMeta(unsigned long* fs, unsigned long* mfs, int group, int level) +{ + dictionary* ini; int res = -1, cnt = 3; char mfn[FTI_BUFS], str[FTI_BUFS], *cfn; - if(level == 0) - { - sprintf(mfn,"%s/sector%d-group%d.fti",FTI_Conf.mTmpDir, FTI_Topo.sectorID, group); + if (level == 0) { + sprintf(mfn, "%s/sector%d-group%d.fti", FTI_Conf.mTmpDir, FTI_Topo.sectorID, group); } else { - sprintf(mfn,"%s/sector%d-group%d.fti",FTI_Ckpt[level].metaDir, FTI_Topo.sectorID, group); + sprintf(mfn, "%s/sector%d-group%d.fti", FTI_Ckpt[level].metaDir, FTI_Topo.sectorID, group); } - sprintf(str, "Getting FTI metadata file (%s)...",mfn); + sprintf(str, "Getting FTI metadata file (%s)...", mfn); FTI_Print(str, FTI_DBUG); - while (( res != 0) && (cnt > 0)) - { + while ((res != 0) && (cnt > 0)) { FTI_Print("Checking FTI metadata file ...", FTI_DBUG); res = access(mfn, R_OK); cnt--; } - if (res != 
0){ + if (res != 0) { FTI_Print("FTI metadata file NOT accessible.", FTI_DBUG); return FTI_NSCS; } ini = iniparser_load(mfn); - if (ini == NULL) - { + if (ini == NULL) { FTI_Print("Iniparser failed to parse the metadata file.", FTI_WARN); return FTI_NSCS; } @@ -57,14 +53,13 @@ int FTI_GetMeta(unsigned long *fs, unsigned long *mfs, int group, int level) { cfn = iniparser_getstring(ini, str, NULL); snprintf(FTI_Exec.ckptFile, FTI_BUFS, "%s", cfn); sprintf(str, "%d:Ckpt_file_size", FTI_Topo.groupRank); - *fs = (int) iniparser_getint(ini, str, -1); + *fs = (int)iniparser_getint(ini, str, -1); sprintf(str, "%d:Ckpt_file_maxs", FTI_Topo.groupRank); - *mfs = (int) iniparser_getint(ini, str, -1); + *mfs = (int)iniparser_getint(ini, str, -1); iniparser_freedict(ini); return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It writes the metadata to recover the data after a failure. @@ -78,58 +73,53 @@ int FTI_GetMeta(unsigned long *fs, unsigned long *mfs, int group, int level) { **/ /*-------------------------------------------------------------------------*/ -int FTI_WriteMetadata(unsigned long *fs, unsigned long mfs, char* fnl) { +int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) +{ char str[FTI_BUFS], buf[FTI_BUFS]; - dictionary *ini; + dictionary* ini; int i; - snprintf(buf, FTI_BUFS, "%s/Topology.fti",FTI_Conf.metadDir); + snprintf(buf, FTI_BUFS, "%s/Topology.fti", FTI_Conf.metadDir); sprintf(str, "Temporary load of topology file (%s)...", buf); FTI_Print(str, FTI_DBUG); ini = iniparser_load(buf); // To bypass iniparser bug while empty dict. 
- if (ini == NULL) - { + if (ini == NULL) { FTI_Print("Temporary topology file could NOT be parsed", FTI_WARN); return FTI_NSCS; } - for (i = 0; i < FTI_Topo.groupSize; i++) - { // Add metadata to dictionary - strncpy(buf,fnl+(i*FTI_BUFS),FTI_BUFS); - sprintf(str,"%d", i); + for (i = 0; i < FTI_Topo.groupSize; i++) { // Add metadata to dictionary + strncpy(buf, fnl + (i * FTI_BUFS), FTI_BUFS); + sprintf(str, "%d", i); iniparser_set(ini, str, NULL); - sprintf(str,"%d:Ckpt_file_name", i); + sprintf(str, "%d:Ckpt_file_name", i); iniparser_set(ini, str, buf); - sprintf(str,"%d:Ckpt_file_size", i); - sprintf(buf,"%ld", fs[i]); + sprintf(str, "%d:Ckpt_file_size", i); + sprintf(buf, "%ld", fs[i]); iniparser_set(ini, str, buf); - sprintf(str,"%d:Ckpt_file_maxs", i); - sprintf(buf,"%ld", mfs); + sprintf(str, "%d:Ckpt_file_maxs", i); + sprintf(buf, "%ld", mfs); iniparser_set(ini, str, buf); } iniparser_unset(ini, "topology"); // Remove topology section - if (access(FTI_Conf.mTmpDir, F_OK) != 0) - { + if (access(FTI_Conf.mTmpDir, F_OK) != 0) { mkdir(FTI_Conf.mTmpDir, 0777); } sprintf(buf, "%s/sector%d-group%d.fti", FTI_Conf.mTmpDir, FTI_Topo.sectorID, FTI_Topo.groupID); remove(buf); sprintf(str, "Creating metadata file (%s)...", buf); FTI_Print(str, FTI_DBUG); - FILE *fd = fopen(buf, "w"); - if (fd == NULL) - { + FILE* fd = fopen(buf, "w"); + if (fd == NULL) { FTI_Print("Metadata file could NOT be opened.", FTI_WARN); iniparser_freedict(ini); return FTI_NSCS; } iniparser_dump_ini(ini, fd); // Write metadata - if (fflush(fd) != 0) - { + if (fflush(fd) != 0) { FTI_Print("Metadata file could NOT be flushed.", FTI_WARN); iniparser_freedict(ini); return FTI_NSCS; } - if (fclose(fd) != 0) - { + if (fclose(fd) != 0) { FTI_Print("Metadata file could NOT be closed.", FTI_WARN); iniparser_freedict(ini); return FTI_NSCS; @@ -138,7 +128,6 @@ int FTI_WriteMetadata(unsigned long *fs, unsigned long mfs, char* fnl) { return FTI_SCES; } - 
/*-------------------------------------------------------------------------*/ /** @brief It writes the metadata to recover the data after a failure. @@ -151,48 +140,45 @@ int FTI_WriteMetadata(unsigned long *fs, unsigned long mfs, char* fnl) { **/ /*-------------------------------------------------------------------------*/ -int FTI_CreateMetadata(int globalTmp) { - char *fnl = talloc(char, FTI_Topo.groupSize*FTI_BUFS); +int FTI_CreateMetadata(int globalTmp) +{ + char* fnl = talloc(char, FTI_Topo.groupSize* FTI_BUFS); unsigned long fs[FTI_BUFS], mfs, tmpo; char str[FTI_BUFS], buf[FTI_BUFS]; struct stat fileStatus; int i; - if (globalTmp) - { - sprintf(buf,"%s/%s",FTI_Conf.gTmpDir, FTI_Exec.ckptFile); - } else { - sprintf(buf,"%s/%s",FTI_Conf.lTmpDir, FTI_Exec.ckptFile); - } - if(stat(buf, &fileStatus) == 0) - { // Getting size of files - fs[FTI_Topo.groupRank] = (unsigned long) fileStatus.st_size; - } else { + if (globalTmp) { + sprintf(buf, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); + } + else { + sprintf(buf, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); + } + if (stat(buf, &fileStatus) == 0) { // Getting size of files + fs[FTI_Topo.groupRank] = (unsigned long)fileStatus.st_size; + } + else { FTI_Print("Error with stat on the checkpoint file.", FTI_WARN); free(fnl); return FTI_NSCS; } sprintf(str, "Checkpoint file size : %ld bytes.", fs[FTI_Topo.groupRank]); FTI_Print(str, FTI_DBUG); - sprintf(fnl+(FTI_Topo.groupRank*FTI_BUFS),"%s",FTI_Exec.ckptFile); + sprintf(fnl + (FTI_Topo.groupRank * FTI_BUFS), "%s", FTI_Exec.ckptFile); tmpo = fs[FTI_Topo.groupRank]; // Gather all the file sizes MPI_Allgather(&tmpo, 1, MPI_UNSIGNED_LONG, fs, 1, MPI_UNSIGNED_LONG, FTI_Exec.groupComm); - strncpy(str,fnl+(FTI_Topo.groupRank*FTI_BUFS),FTI_BUFS); // Gather all the file names + strncpy(str, fnl + (FTI_Topo.groupRank * FTI_BUFS), FTI_BUFS); // Gather all the file names MPI_Allgather(str, FTI_BUFS, MPI_CHAR, fnl, FTI_BUFS, MPI_CHAR, FTI_Exec.groupComm); mfs = 0; - for(i = 0; i 
< FTI_Topo.groupSize; i++) - { - if (fs[i] > mfs) - { + for (i = 0; i < FTI_Topo.groupSize; i++) { + if (fs[i] > mfs) { mfs = fs[i]; // Search max. size } } sprintf(str, "Max. file size %ld.", mfs); FTI_Print(str, FTI_DBUG); - if (FTI_Topo.groupRank == 0) - { // Only one process in the group create the metadata + if (FTI_Topo.groupRank == 0) { // Only one process in the group create the metadata int res = FTI_Try(FTI_WriteMetadata(fs, mfs, fnl), "write the metadata."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { free(fnl); return FTI_NSCS; } @@ -200,5 +186,3 @@ int FTI_CreateMetadata(int globalTmp) { free(fnl); return FTI_SCES; } - - diff --git a/src/postckpt.c b/src/postckpt.c index 25aa2eb7f..9656ef5a0 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -5,10 +5,8 @@ * @brief Post-checkpointing functions for the FTI library. */ - #include "fti.h" - /*-------------------------------------------------------------------------*/ /** @brief It returns FTI_SCES. @@ -19,15 +17,16 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_Local(int group) { +int FTI_Local(int group) +{ unsigned long maxFs, fs; FTI_Print("Starting checkpoint post-processing L1", FTI_DBUG); int res = FTI_Try(FTI_GetMeta(&fs, &maxFs, group, 0), "obtain metadata."); - if (res == FTI_NSCS) return FTI_NSCS; + if (res == FTI_NSCS) + return FTI_NSCS; return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It copies ckpt. files in to the partner node. 
@@ -40,42 +39,52 @@ int FTI_Local(int group) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Ptner(int group) { - char *blBuf1, *blBuf2, lfn[FTI_BUFS], pfn[FTI_BUFS], str[FTI_BUFS]; +int FTI_Ptner(int group) +{ + char *blBuf1, *blBuf2, lfn[FTI_BUFS], pfn[FTI_BUFS], str[FTI_BUFS]; unsigned long maxFs, fs, ps, pos = 0; MPI_Request reqSend, reqRecv; - FILE *lfd, *pfd; - int res, dest, src, bSize = FTI_Conf.blockSize; - MPI_Status status; + FILE *lfd, *pfd; + int res, dest, src, bSize = FTI_Conf.blockSize; + MPI_Status status; FTI_Print("Starting checkpoint post-processing L2", FTI_DBUG); res = FTI_Try(FTI_GetMeta(&fs, &maxFs, group, 0), "obtain metadata."); - if (res == FTI_NSCS) return FTI_NSCS; - ps = (maxFs/FTI_Conf.blockSize)*FTI_Conf.blockSize; - if (ps < maxFs) ps = ps + FTI_Conf.blockSize; + if (res == FTI_NSCS) + return FTI_NSCS; + ps = (maxFs / FTI_Conf.blockSize) * FTI_Conf.blockSize; + if (ps < maxFs) + ps = ps + FTI_Conf.blockSize; sprintf(str, "Max. file size %ld and padding size %ld.", maxFs, ps); FTI_Print(str, FTI_DBUG); - sscanf(FTI_Exec.ckptFile,"Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &src); - sprintf(lfn,"%s/%s",FTI_Conf.lTmpDir, FTI_Exec.ckptFile); - sprintf(pfn,"%s/Ckpt%d-Pcof%d.fti", FTI_Conf.lTmpDir, FTI_Exec.ckptID, src); + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &src); + sprintf(lfn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); + sprintf(pfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Conf.lTmpDir, FTI_Exec.ckptID, src); sprintf(str, "L2 trying to access local ckpt. file (%s).", lfn); FTI_Print(str, FTI_DBUG); res = FTI_Try(access(lfn, R_OK), " access the L2 checkpoint file."); - if (res == FTI_NSCS) return FTI_NSCS; + if (res == FTI_NSCS) + return FTI_NSCS; dest = FTI_Topo.right; src = FTI_Topo.left; lfd = fopen(lfn, "rb"); pfd = fopen(pfn, "wb"); - if (lfd == NULL) { FTI_Print("FTI failed to open L2 chckpt. 
file.", FTI_DBUG); return FTI_NSCS; } - if (pfd == NULL) { FTI_Print("FTI failed to open L2 partner file.", FTI_DBUG); return FTI_NSCS; } + if (lfd == NULL) { + FTI_Print("FTI failed to open L2 chckpt. file.", FTI_DBUG); + return FTI_NSCS; + } + if (pfd == NULL) { + FTI_Print("FTI failed to open L2 partner file.", FTI_DBUG); + return FTI_NSCS; + } blBuf1 = talloc(char, FTI_Conf.blockSize); blBuf2 = talloc(char, FTI_Conf.blockSize); - while(pos < ps) - { // Checkpoint files partner copy - if ((fs-pos) < FTI_Conf.blockSize) bSize = fs - pos; + while (pos < ps) { // Checkpoint files partner copy + if ((fs - pos) < FTI_Conf.blockSize) + bSize = fs - pos; fread(blBuf1, sizeof(char), bSize, lfd); MPI_Isend(blBuf1, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); MPI_Irecv(blBuf2, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv); @@ -91,7 +100,6 @@ int FTI_Ptner(int group) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It performs RS encoding with the ckpt. files in to the group. 
@@ -104,7 +112,8 @@ int FTI_Ptner(int group) { **/ /*-------------------------------------------------------------------------*/ -int FTI_RSenc(int group) { +int FTI_RSenc(int group) +{ char *myData, *data, *coding, lfn[FTI_BUFS], efn[FTI_BUFS], str[FTI_BUFS]; int *matrix, cnt, i, j, init, src, offset, dest, matVal, res, bs = FTI_Conf.blockSize; unsigned long maxFs, fs, ps, pos = 0; @@ -115,85 +124,81 @@ int FTI_RSenc(int group) { FTI_Print("Starting checkpoint post-processing L3", FTI_DBUG); res = FTI_Try(FTI_GetMeta(&fs, &maxFs, group, 0), "obtain metadata."); - if (res != FTI_SCES) return FTI_NSCS; - ps = ((maxFs/bs))*bs; - if (ps < maxFs) ps = ps + bs; + if (res != FTI_SCES) + return FTI_NSCS; + ps = ((maxFs / bs)) * bs; + if (ps < maxFs) + ps = ps + bs; - sscanf(FTI_Exec.ckptFile,"Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); - sprintf(lfn,"%s/%s",FTI_Conf.lTmpDir, FTI_Exec.ckptFile); - sprintf(efn,"%s/Ckpt%d-RSed%d.fti", FTI_Conf.lTmpDir, FTI_Exec.ckptID, i); + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); + sprintf(lfn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); + sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Conf.lTmpDir, FTI_Exec.ckptID, i); sprintf(str, "L3 trying to access local ckpt. file (%s).", lfn); FTI_Print(str, FTI_DBUG); res = FTI_Try(access(lfn, R_OK), "access the L3 checkpoint file."); - if (res != FTI_SCES) return FTI_NSCS; + if (res != FTI_SCES) + return FTI_NSCS; lfd = fopen(lfn, "rb"); efd = fopen(efn, "wb"); - if (lfd == NULL) - { + if (lfd == NULL) { FTI_Print("FTI failed to open L3 checkpoint file.", FTI_EROR); return FTI_NSCS; } - if (efd == NULL) - { + if (efd == NULL) { FTI_Print("FTI failed to open encoded ckpt. 
file.", FTI_EROR); return FTI_NSCS; } myData = talloc(char, bs); coding = talloc(char, bs); - data = talloc(char, 2*bs); - matrix = talloc(int, FTI_Topo.groupSize*FTI_Topo.groupSize); + data = talloc(char, 2 * bs); + matrix = talloc(int, FTI_Topo.groupSize* FTI_Topo.groupSize); for (i = 0; i < FTI_Topo.groupSize; i++) { for (j = 0; j < FTI_Topo.groupSize; j++) { - matrix[i*FTI_Topo.groupSize+j] = - galois_single_divide(1, i ^ (FTI_Topo.groupSize + j), FTI_Conf.l3WordSize); + matrix[i * FTI_Topo.groupSize + j] = galois_single_divide(1, i ^ (FTI_Topo.groupSize + j), FTI_Conf.l3WordSize); } } - while(pos < ps) - { // For each block - if ((fs-pos) < bs) remBsize = fs-pos; + while (pos < ps) { // For each block + if ((fs - pos) < bs) + remBsize = fs - pos; fread(myData, sizeof(char), remBsize, lfd); // Reading checkpoint files dest = FTI_Topo.groupRank; i = FTI_Topo.groupRank; offset = 0; init = 0; cnt = 0; - while(cnt < FTI_Topo.groupSize) - { // For each encoding - if (cnt == 0) - { - memcpy(&(data[offset*bs]), myData, sizeof(char)*bs); - } else { + while (cnt < FTI_Topo.groupSize) { // For each encoding + if (cnt == 0) { + memcpy(&(data[offset * bs]), myData, sizeof(char) * bs); + } + else { MPI_Wait(&reqSend, &status); MPI_Wait(&reqRecv, &status); } - if (cnt != FTI_Topo.groupSize-1) - { // At every loop *but* the last one we send the data - dest = (dest+FTI_Topo.groupSize-1)%FTI_Topo.groupSize; - src = (i+1)%FTI_Topo.groupSize; + if (cnt != FTI_Topo.groupSize - 1) { // At every loop *but* the last one we send the data + dest = (dest + FTI_Topo.groupSize - 1) % FTI_Topo.groupSize; + src = (i + 1) % FTI_Topo.groupSize; MPI_Isend(myData, bs, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); - MPI_Irecv(&(data[(1-offset)*bs]), bs, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv); + MPI_Irecv(&(data[(1 - offset) * bs]), bs, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv); } - matVal = matrix[FTI_Topo.groupRank*FTI_Topo.groupSize+i]; - if 
(matVal == 1) - { // First copy or xor any data that does not need to be multiplied by a factor - if (init == 0) - { - memcpy(coding, &(data[offset*bs]), bs); + matVal = matrix[FTI_Topo.groupRank * FTI_Topo.groupSize + i]; + if (matVal == 1) { // First copy or xor any data that does not need to be multiplied by a factor + if (init == 0) { + memcpy(coding, &(data[offset * bs]), bs); init = 1; - } else { - galois_region_xor(&(data[offset*bs]), coding, coding, bs); + } + else { + galois_region_xor(&(data[offset * bs]), coding, coding, bs); } } - if (matVal != 0 && matVal != 1) - { // Then the data that needs to be multiplied by a factor - galois_w16_region_multiply(&(data[offset*bs]), matVal, bs, coding, init); + if (matVal != 0 && matVal != 1) { // Then the data that needs to be multiplied by a factor + galois_w16_region_multiply(&(data[offset * bs]), matVal, bs, coding, init); init = 1; } - i = (i+1)%FTI_Topo.groupSize; + i = (i + 1) % FTI_Topo.groupSize; offset = 1 - offset; cnt++; } @@ -212,7 +217,6 @@ int FTI_RSenc(int group) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It flushes the local ckpt. files in to the PFS. 
@@ -224,51 +228,59 @@ int FTI_RSenc(int group) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Flush(int group, int level) { - char lfn[FTI_BUFS], gfn[FTI_BUFS], str[FTI_BUFS], *blBuf1 = talloc(char, FTI_Conf.blockSize); +int FTI_Flush(int group, int level) +{ + char lfn[FTI_BUFS], gfn[FTI_BUFS], str[FTI_BUFS], *blBuf1 = talloc(char, FTI_Conf.blockSize); unsigned long maxFs, fs, ps, pos = 0, bSize = FTI_Conf.blockSize; - FILE *lfd, *gfd; - if (level == -1) return FTI_SCES; // Fake call for inline PFS checkpoint + FILE *lfd, *gfd; + if (level == -1) + return FTI_SCES; // Fake call for inline PFS checkpoint FTI_Print("Starting checkpoint post-processing L4", FTI_DBUG); int res = FTI_Try(FTI_GetMeta(&fs, &maxFs, group, level), "obtain metadata."); - if (res != FTI_SCES) return FTI_NSCS; + if (res != FTI_SCES) + return FTI_NSCS; if (access(FTI_Conf.gTmpDir, F_OK) != 0) { mkdir(FTI_Conf.gTmpDir, 0777); } - ps = (maxFs/FTI_Conf.blockSize)*FTI_Conf.blockSize; - if (ps < maxFs) ps = ps + FTI_Conf.blockSize; - switch(level) - { - case 0: sprintf(lfn,"%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); break; - case 1: sprintf(lfn,"%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); break; - case 2: sprintf(lfn,"%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); break; - case 3: sprintf(lfn,"%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); break; + ps = (maxFs / FTI_Conf.blockSize) * FTI_Conf.blockSize; + if (ps < maxFs) + ps = ps + FTI_Conf.blockSize; + switch (level) { + case 0: + sprintf(lfn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); + break; + case 1: + sprintf(lfn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); + break; + case 2: + sprintf(lfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); + break; + case 3: + sprintf(lfn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); + break; } - sprintf(gfn,"%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); // Open and resize files + sprintf(gfn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); // Open and resize 
files sprintf(str, "L4 trying to access local ckpt. file (%s).", lfn); FTI_Print(str, FTI_DBUG); - if (access(lfn, R_OK) != 0) - { + if (access(lfn, R_OK) != 0) { FTI_Print("L4 cannot access the checkpoint file.", FTI_EROR); return FTI_NSCS; } lfd = fopen(lfn, "rb"); - if (lfd == NULL) - { + if (lfd == NULL) { FTI_Print("L4 cannot open the checkpoint file.", FTI_EROR); return FTI_NSCS; } gfd = fopen(gfn, "wb"); - if (gfd == NULL) - { + if (gfd == NULL) { FTI_Print("L4 cannot open ckpt. file in the PFS.", FTI_EROR); return FTI_NSCS; } - while(pos < ps) - { // Checkpoint files exchange - if ((fs-pos) < FTI_Conf.blockSize) bSize = fs - pos; + while (pos < ps) { // Checkpoint files exchange + if ((fs - pos) < FTI_Conf.blockSize) + bSize = fs - pos; fread(blBuf1, sizeof(char), bSize, lfd); fwrite(blBuf1, sizeof(char), bSize, gfd); pos = pos + FTI_Conf.blockSize; @@ -277,6 +289,3 @@ int FTI_Flush(int group, int level) { fclose(gfd); return FTI_SCES; } - - - diff --git a/src/postreco.c b/src/postreco.c index 7b509c0a8..b295c269b 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -5,10 +5,8 @@ * @brief Post recovery functions for the FTI library. */ - #include "fti.h" - /*-------------------------------------------------------------------------*/ /** @brief Recover a set of ckpt. files using RS decoding. 
@@ -19,72 +17,128 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_Decode(int fs, int maxFs, int *erased) { +int FTI_Decode(int fs, int maxFs, int* erased) +{ int *matrix, *decMatrix, *dm_ids, *tmpmat, i, j, k, m, ps, bs, pos = 0; char **coding, **data, *dataTmp, fn[FTI_BUFS], efn[FTI_BUFS], str[FTI_BUFS]; FILE *fd, *efd; - bs = FTI_Conf.blockSize; k = FTI_Topo.groupSize; m = k; - ps = ((maxFs/FTI_Conf.blockSize))*FTI_Conf.blockSize; - if (ps < maxFs) ps = ps + FTI_Conf.blockSize; // Calculating padding size - if (access(FTI_Ckpt[3].dir, F_OK) != 0) mkdir(FTI_Ckpt[3].dir, 0777); - sscanf(FTI_Exec.ckptFile,"Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); - sprintf(fn,"%s/%s",FTI_Ckpt[3].dir, FTI_Exec.ckptFile); - sprintf(efn,"%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, i); - data = talloc(char *, k); coding = talloc(char *, m); dataTmp = talloc(char, FTI_Conf.blockSize*k); - dm_ids = talloc(int, k); decMatrix = talloc(int, k*k); tmpmat = talloc(int, k*k); matrix = talloc(int, k*k); + bs = FTI_Conf.blockSize; + k = FTI_Topo.groupSize; + m = k; + ps = ((maxFs / FTI_Conf.blockSize)) * FTI_Conf.blockSize; + if (ps < maxFs) + ps = ps + FTI_Conf.blockSize; // Calculating padding size + if (access(FTI_Ckpt[3].dir, F_OK) != 0) + mkdir(FTI_Ckpt[3].dir, 0777); + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); + sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); + sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, i); + data = talloc(char*, k); + coding = talloc(char*, m); + dataTmp = talloc(char, FTI_Conf.blockSize* k); + dm_ids = talloc(int, k); + decMatrix = talloc(int, k* k); + tmpmat = talloc(int, k* k); + matrix = talloc(int, k* k); for (i = 0; i < FTI_Topo.groupSize; i++) { for (j = 0; j < FTI_Topo.groupSize; j++) { - matrix[i*FTI_Topo.groupSize+j] = - galois_single_divide(1, i ^ (FTI_Topo.groupSize + j), FTI_Conf.l3WordSize); + matrix[i * FTI_Topo.groupSize + j] = 
galois_single_divide(1, i ^ (FTI_Topo.groupSize + j), FTI_Conf.l3WordSize); } } for (i = 0; i < m; i++) { coding[i] = talloc(char, FTI_Conf.blockSize); data[i] = talloc(char, FTI_Conf.blockSize); } - j = 0; for (i = 0; j < k; i++) { if (erased[i] == 0) {dm_ids[j] = i; j++;} } + j = 0; + for (i = 0; j < k; i++) { + if (erased[i] == 0) { + dm_ids[j] = i; + j++; + } + } for (i = 0; i < k; i++) { // Building the matrix if (dm_ids[i] < k) { - for (j = 0; j < k; j++) tmpmat[i*k+j] = 0; - tmpmat[i*k+dm_ids[i]] = 1; - } else for (j = 0; j < k; j++) { tmpmat[i*k+j] = matrix[(dm_ids[i]-k)*k+j]; } + for (j = 0; j < k; j++) + tmpmat[i * k + j] = 0; + tmpmat[i * k + dm_ids[i]] = 1; + } + else + for (j = 0; j < k; j++) { + tmpmat[i * k + j] = matrix[(dm_ids[i] - k) * k + j]; + } } // Inversing the matrix - if (jerasure_invert_matrix(tmpmat, decMatrix, k, FTI_Conf.l3WordSize) < 0) - { FTI_Print("Error inversing matrix", FTI_DBUG); return FTI_NSCS; } - if(erased[FTI_Topo.groupRank] == 0) { // Resize and open files - if (truncate(fn,ps) == -1) { FTI_Print("Error with truncate on checkpoint file", FTI_DBUG); return FTI_NSCS; } - fd = fopen(fn, "rb"); efd = fopen(efn, "rb"); - } else { fd = fopen(fn, "wb"); efd = fopen(efn, "wb"); } - if (fd == NULL) { FTI_Print("R3 cannot open checkpoint file.", FTI_DBUG); return FTI_NSCS; } - if (efd == NULL) { FTI_Print("R3 cannot open encoded ckpt. 
file.", FTI_DBUG); return FTI_NSCS; } - while(pos < ps) { // Main loop, block by block - if(erased[FTI_Topo.groupRank] == 0) { // Reading the data - fread(data[FTI_Topo.groupRank]+0, sizeof(char), bs, fd); - fread(coding[FTI_Topo.groupRank]+0, sizeof(char), bs, efd); - } else { bzero(data[FTI_Topo.groupRank], bs); bzero(coding[FTI_Topo.groupRank], bs); } // Erasure found - MPI_Allgather(data[FTI_Topo.groupRank]+0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); - for (i = 0; i < k; i++) memcpy(data[i]+0, &(dataTmp[i*bs]), sizeof(char)*bs); - MPI_Allgather(coding[FTI_Topo.groupRank]+0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); - for (i = 0; i < k; i++) memcpy(coding[i]+0, &(dataTmp[i*bs]), sizeof(char)*bs); + if (jerasure_invert_matrix(tmpmat, decMatrix, k, FTI_Conf.l3WordSize) < 0) { + FTI_Print("Error inversing matrix", FTI_DBUG); + return FTI_NSCS; + } + if (erased[FTI_Topo.groupRank] == 0) { // Resize and open files + if (truncate(fn, ps) == -1) { + FTI_Print("Error with truncate on checkpoint file", FTI_DBUG); + return FTI_NSCS; + } + fd = fopen(fn, "rb"); + efd = fopen(efn, "rb"); + } + else { + fd = fopen(fn, "wb"); + efd = fopen(efn, "wb"); + } + if (fd == NULL) { + FTI_Print("R3 cannot open checkpoint file.", FTI_DBUG); + return FTI_NSCS; + } + if (efd == NULL) { + FTI_Print("R3 cannot open encoded ckpt. 
file.", FTI_DBUG); + return FTI_NSCS; + } + while (pos < ps) { // Main loop, block by block + if (erased[FTI_Topo.groupRank] == 0) { // Reading the data + fread(data[FTI_Topo.groupRank] + 0, sizeof(char), bs, fd); + fread(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); + } + else { + bzero(data[FTI_Topo.groupRank], bs); + bzero(coding[FTI_Topo.groupRank], bs); + } // Erasure found + MPI_Allgather(data[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); + for (i = 0; i < k; i++) + memcpy(data[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); + MPI_Allgather(coding[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); + for (i = 0; i < k; i++) + memcpy(coding[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); if (erased[FTI_Topo.groupRank]) // Decoding the lost data work - jerasure_matrix_dotprod(k, FTI_Conf.l3WordSize, decMatrix+(FTI_Topo.groupRank*k), dm_ids, FTI_Topo.groupRank, data, coding, bs); - MPI_Allgather(data[FTI_Topo.groupRank]+0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); - for (i = 0; i < k; i++) memcpy(data[i]+0, &(dataTmp[i*bs]), sizeof(char)*bs); + jerasure_matrix_dotprod(k, FTI_Conf.l3WordSize, decMatrix + (FTI_Topo.groupRank * k), dm_ids, FTI_Topo.groupRank, data, coding, bs); + MPI_Allgather(data[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); + for (i = 0; i < k; i++) + memcpy(data[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); if (erased[FTI_Topo.groupRank + k]) // Finally, re-encode any erased encoded checkpoint file - jerasure_matrix_dotprod(k, FTI_Conf.l3WordSize, matrix+(FTI_Topo.groupRank*k), NULL, FTI_Topo.groupRank+k, data, coding, bs); - if (erased[FTI_Topo.groupRank]) fwrite(data[FTI_Topo.groupRank]+0, sizeof(char), bs, fd); - if (erased[FTI_Topo.groupRank + k]) fwrite(coding[FTI_Topo.groupRank]+0, sizeof(char), bs, efd); + jerasure_matrix_dotprod(k, FTI_Conf.l3WordSize, matrix + (FTI_Topo.groupRank * k), NULL, 
FTI_Topo.groupRank + k, data, coding, bs); + if (erased[FTI_Topo.groupRank]) + fwrite(data[FTI_Topo.groupRank] + 0, sizeof(char), bs, fd); + if (erased[FTI_Topo.groupRank + k]) + fwrite(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); pos = pos + bs; } - fclose(fd); fclose(efd); // Closing files - if (truncate(fn,fs) == -1) { FTI_Print("R3 cannot re-truncate checkpoint file.", FTI_DBUG); return FTI_NSCS; } - if (truncate(efn,fs) == -1) { FTI_Print("R3 cannot re-truncate encoded ckpt. file.", FTI_DBUG); return FTI_NSCS; } - free(tmpmat); free(dm_ids); free(decMatrix); free(matrix); free(data); free(dataTmp); free(coding); + fclose(fd); + fclose(efd); // Closing files + if (truncate(fn, fs) == -1) { + FTI_Print("R3 cannot re-truncate checkpoint file.", FTI_DBUG); + return FTI_NSCS; + } + if (truncate(efn, fs) == -1) { + FTI_Print("R3 cannot re-truncate encoded ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } + free(tmpmat); + free(dm_ids); + free(decMatrix); + free(matrix); + free(data); + free(dataTmp); + free(coding); return FTI_SCES; } - - /*-------------------------------------------------------------------------*/ /** @brief Checks that all L1 ckpt. files are present. 
@@ -96,17 +150,25 @@ int FTI_Decode(int fs, int maxFs, int *erased) { **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverL1(int group) { - int erased[FTI_BUFS], buf, i, j; // FTI_BUFS > 32*3 +int FTI_RecoverL1(int group) +{ + int erased[FTI_BUFS], buf, i, j; // FTI_BUFS > 32*3 unsigned long fs, maxFs; - if (FTI_CheckErasures(&fs, &maxFs, group, erased, 1) != FTI_SCES) - { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } - buf = 0; for(j = 0; j < FTI_Topo.groupSize; j++) if(erased[j]) buf++; // Counting erasures - if (buf > 0) { FTI_Print("Checkpoint files missing at L1.", FTI_DBUG); return FTI_NSCS; } + if (FTI_CheckErasures(&fs, &maxFs, group, erased, 1) != FTI_SCES) { + FTI_Print("Error checking erasures.", FTI_DBUG); + return FTI_NSCS; + } + buf = 0; + for (j = 0; j < FTI_Topo.groupSize; j++) + if (erased[j]) + buf++; // Counting erasures + if (buf > 0) { + FTI_Print("Checkpoint files missing at L1.", FTI_DBUG); + return FTI_NSCS; + } return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Recover L2 ckpt. files using the partner copy. 
@@ -119,14 +181,15 @@ int FTI_RecoverL1(int group) { **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverL2(int group) { - int erased[FTI_BUFS], gs, buf, j, src, dest; - char str[FTI_BUFS], lfn[FTI_BUFS], pfn[FTI_BUFS], jfn[FTI_BUFS], qfn[FTI_BUFS]; - char *blBuf1, *blBuf2, *blBuf3, *blBuf4; +int FTI_RecoverL2(int group) +{ + int erased[FTI_BUFS], gs, buf, j, src, dest; + char str[FTI_BUFS], lfn[FTI_BUFS], pfn[FTI_BUFS], jfn[FTI_BUFS], qfn[FTI_BUFS]; + char *blBuf1, *blBuf2, *blBuf3, *blBuf4; unsigned long ps, fs, maxFs, pos = 0; - FILE *lfd, *pfd, *jfd, *qfd; + FILE *lfd, *pfd, *jfd, *qfd; MPI_Request reqSend1, reqRecv1, reqSend2, reqRecv2; - MPI_Status status; + MPI_Status status; blBuf1 = talloc(char, FTI_Conf.blockSize); blBuf2 = talloc(char, FTI_Conf.blockSize); blBuf3 = talloc(char, FTI_Conf.blockSize); @@ -134,49 +197,87 @@ int FTI_RecoverL2(int group) { gs = FTI_Topo.groupSize; src = FTI_Topo.left; dest = FTI_Topo.right; - if (access(FTI_Ckpt[2].dir, F_OK) != 0) mkdir(FTI_Ckpt[2].dir, 0777); - if ( FTI_CheckErasures(&fs, &maxFs, group, erased, 2) != FTI_SCES) // Checking erasures - { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } - buf = -1; for(j = 0; j < gs; j++) if(erased[j] && erased[((j+1)%gs)+gs]) buf=j; // Counting erasures + if (access(FTI_Ckpt[2].dir, F_OK) != 0) + mkdir(FTI_Ckpt[2].dir, 0777); + if (FTI_CheckErasures(&fs, &maxFs, group, erased, 2) != FTI_SCES) // Checking erasures + { + FTI_Print("Error checking erasures.", FTI_DBUG); + return FTI_NSCS; + } + buf = -1; + for (j = 0; j < gs; j++) + if (erased[j] && erased[((j + 1) % gs) + gs]) + buf = j; // Counting erasures sprintf(str, "A checkpoint file and its partner copy (ID in group : %d) have been lost", buf); - if (buf > -1) { FTI_Print(str, FTI_DBUG); return FTI_NSCS; } - buf = 0; for(j = 0; j < gs*2; j++) if(erased[j]) buf++; // Counting erasures + if (buf > -1) { + FTI_Print(str, FTI_DBUG); + return FTI_NSCS; + } + buf = 
0; + for (j = 0; j < gs * 2; j++) + if (erased[j]) + buf++; // Counting erasures if (buf > 0) { - ps = (maxFs/FTI_Conf.blockSize)*FTI_Conf.blockSize; pos = 0; // For the logic - if (ps < maxFs) ps = ps + FTI_Conf.blockSize; // Calculating padding size - sprintf(str,"File size: %ld, max. file size : %ld and padding size : %ld.", fs, maxFs, ps); + ps = (maxFs / FTI_Conf.blockSize) * FTI_Conf.blockSize; + pos = 0; // For the logic + if (ps < maxFs) + ps = ps + FTI_Conf.blockSize; // Calculating padding size + sprintf(str, "File size: %ld, max. file size : %ld and padding size : %ld.", fs, maxFs, ps); FTI_Print(str, FTI_DBUG); if (erased[FTI_Topo.groupRank]) { // Open checkpoint file to recover - sprintf(lfn,"%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); - sscanf(FTI_Exec.ckptFile,"Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); - sprintf(jfn,"%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); - sprintf(str,"Opening checkpoint file (%s) to recover (L2).", lfn); FTI_Print(str, FTI_DBUG); - sprintf(str,"Opening partner ckpt. file (%s) to recover (L2).", jfn); FTI_Print(str, FTI_DBUG); - lfd = fopen(lfn, "wb"); jfd = fopen(jfn, "wb"); - if (lfd == NULL) { FTI_Print("R2 cannot open the checkpoint file.", FTI_DBUG); return FTI_NSCS; } - if (jfd == NULL) { FTI_Print("R2 cannot open the partner ckpt. file.", FTI_DBUG); return FTI_NSCS; } + sprintf(lfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); + sprintf(jfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); + sprintf(str, "Opening checkpoint file (%s) to recover (L2).", lfn); + FTI_Print(str, FTI_DBUG); + sprintf(str, "Opening partner ckpt. file (%s) to recover (L2).", jfn); + FTI_Print(str, FTI_DBUG); + lfd = fopen(lfn, "wb"); + jfd = fopen(jfn, "wb"); + if (lfd == NULL) { + FTI_Print("R2 cannot open the checkpoint file.", FTI_DBUG); + return FTI_NSCS; + } + if (jfd == NULL) { + FTI_Print("R2 cannot open the partner ckpt. 
file.", FTI_DBUG); + return FTI_NSCS; + } } - if (erased[src] && !erased[gs+FTI_Topo.groupRank]) { // Truncate and open partner file to transfer - sscanf(FTI_Exec.ckptFile,"Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); - sprintf(pfn,"%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); - sprintf(str,"Opening partner ckpt. file (%s) to transfer (L2).", pfn); FTI_Print(str, FTI_DBUG); - if (truncate(pfn,ps) == -1) { FTI_Print("R2 cannot truncate the partner ckpt. file.", FTI_DBUG); return FTI_NSCS; } + if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { // Truncate and open partner file to transfer + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); + sprintf(pfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); + sprintf(str, "Opening partner ckpt. file (%s) to transfer (L2).", pfn); + FTI_Print(str, FTI_DBUG); + if (truncate(pfn, ps) == -1) { + FTI_Print("R2 cannot truncate the partner ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } pfd = fopen(pfn, "rb"); - if (pfd == NULL) { FTI_Print("R2 cannot open partner ckpt. file.", FTI_DBUG); return FTI_NSCS; } + if (pfd == NULL) { + FTI_Print("R2 cannot open partner ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } } - if (erased[dest] && !erased[gs+FTI_Topo.groupRank]) { // Truncate and open partner file to transfer - sprintf(qfn,"%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); - sprintf(str,"Opening ckpt. file (%s) to transfer (L2).", qfn); FTI_Print(str, FTI_DBUG); - if (truncate(qfn,ps) == -1) { FTI_Print("R2 cannot truncate the ckpt. file.", FTI_DBUG); return FTI_NSCS; } + if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { // Truncate and open partner file to transfer + sprintf(qfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); + sprintf(str, "Opening ckpt. file (%s) to transfer (L2).", qfn); + FTI_Print(str, FTI_DBUG); + if (truncate(qfn, ps) == -1) { + FTI_Print("R2 cannot truncate the ckpt. 
file.", FTI_DBUG); + return FTI_NSCS; + } qfd = fopen(qfn, "rb"); - if (qfd == NULL) { FTI_Print("R2 cannot open ckpt. file.", FTI_DBUG); return FTI_NSCS; } + if (qfd == NULL) { + FTI_Print("R2 cannot open ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } } - while(pos < ps) { // Checkpoint files exchange - if (erased[src] && !erased[gs+FTI_Topo.groupRank]) { + while (pos < ps) { // Checkpoint files exchange + if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { fread(blBuf1, sizeof(char), FTI_Conf.blockSize, pfd); MPI_Isend(blBuf1, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend1); } - if (erased[dest] && !erased[gs+FTI_Topo.groupRank]) { + if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { fread(blBuf3, sizeof(char), FTI_Conf.blockSize, qfd); MPI_Isend(blBuf3, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend2); } @@ -184,38 +285,68 @@ int FTI_RecoverL2(int group) { MPI_Irecv(blBuf2, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv1); MPI_Irecv(blBuf4, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv2); } - if (erased[src] && !erased[gs+FTI_Topo.groupRank]) MPI_Wait(&reqSend1, &status); - if (erased[dest] && !erased[gs+FTI_Topo.groupRank]) MPI_Wait(&reqSend2, &status); + if (erased[src] && !erased[gs + FTI_Topo.groupRank]) + MPI_Wait(&reqSend1, &status); + if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) + MPI_Wait(&reqSend2, &status); if (erased[FTI_Topo.groupRank]) { MPI_Wait(&reqRecv1, &status); MPI_Wait(&reqRecv2, &status); - if (fwrite(blBuf2, sizeof(char), FTI_Conf.blockSize, lfd) != FTI_Conf.blockSize) - { FTI_Print("Errors writting the data in the R2 checkpoint file.", FTI_DBUG); return FTI_NSCS; } - if (fwrite(blBuf4, sizeof(char), FTI_Conf.blockSize, jfd) != FTI_Conf.blockSize) - { FTI_Print("Errors writting the data in the R2 partner ckpt. 
file.", FTI_DBUG); return FTI_NSCS; } + if (fwrite(blBuf2, sizeof(char), FTI_Conf.blockSize, lfd) != FTI_Conf.blockSize) { + FTI_Print("Errors writting the data in the R2 checkpoint file.", FTI_DBUG); + return FTI_NSCS; + } + if (fwrite(blBuf4, sizeof(char), FTI_Conf.blockSize, jfd) != FTI_Conf.blockSize) { + FTI_Print("Errors writting the data in the R2 partner ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } } pos = pos + FTI_Conf.blockSize; } if (erased[FTI_Topo.groupRank]) { // Close files - if (fclose(lfd) != 0) { FTI_Print("R2 cannot close the checkpoint file.", FTI_DBUG); return FTI_NSCS; } - if (truncate(lfn,fs) == -1) { FTI_Print("R2 cannot re-truncate the checkpoint file.", FTI_DBUG); return FTI_NSCS; } - if (fclose(jfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. file.", FTI_DBUG); return FTI_NSCS; } - if (truncate(jfn,fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. file.", FTI_DBUG); return FTI_NSCS; } + if (fclose(lfd) != 0) { + FTI_Print("R2 cannot close the checkpoint file.", FTI_DBUG); + return FTI_NSCS; + } + if (truncate(lfn, fs) == -1) { + FTI_Print("R2 cannot re-truncate the checkpoint file.", FTI_DBUG); + return FTI_NSCS; + } + if (fclose(jfd) != 0) { + FTI_Print("R2 cannot close the partner ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } + if (truncate(jfn, fs) == -1) { + FTI_Print("R2 cannot re-truncate the partner ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } } - if (erased[src] && !erased[gs+FTI_Topo.groupRank]) { - if (fclose(pfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. file", FTI_DBUG); return FTI_NSCS; } - if (truncate(pfn,fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. file.", FTI_DBUG); return FTI_NSCS; } + if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { + if (fclose(pfd) != 0) { + FTI_Print("R2 cannot close the partner ckpt. file", FTI_DBUG); + return FTI_NSCS; + } + if (truncate(pfn, fs) == -1) { + FTI_Print("R2 cannot re-truncate the partner ckpt. 
file.", FTI_DBUG); + return FTI_NSCS; + } } - if (erased[dest] && !erased[gs+FTI_Topo.groupRank]) { - if (fclose(qfd) != 0) { FTI_Print("R2 cannot close the ckpt. file", FTI_DBUG); return FTI_NSCS; } - if (truncate(qfn,fs) == -1) { FTI_Print("R2 cannot re-truncate the ckpt. file.", FTI_DBUG); return FTI_NSCS; } + if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { + if (fclose(qfd) != 0) { + FTI_Print("R2 cannot close the ckpt. file", FTI_DBUG); + return FTI_NSCS; + } + if (truncate(qfn, fs) == -1) { + FTI_Print("R2 cannot re-truncate the ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } } } - free(blBuf1); free(blBuf2); // Free memory + free(blBuf1); + free(blBuf2); // Free memory return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Recover L3 ckpt. files ordering the RS decoding algorithm. @@ -228,25 +359,41 @@ int FTI_RecoverL2(int group) { **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverL3(int group) { - int erased[FTI_BUFS], gs, j, l = 0; +int FTI_RecoverL3(int group) +{ + int erased[FTI_BUFS], gs, j, l = 0; unsigned long fs, maxFs; - char str[FTI_BUFS]; + char str[FTI_BUFS]; gs = FTI_Topo.groupSize; - if (access(FTI_Ckpt[3].dir, F_OK) != 0) mkdir(FTI_Ckpt[3].dir, 0777); - if ( FTI_CheckErasures(&fs, &maxFs, group, erased, 3) != FTI_SCES) // Checking erasures - { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } - l = 0; for(j = 0; j < gs; j++) { if(erased[j]) l++; if(erased[j+gs]) l++; } // Counting erasures - if (l > gs) { FTI_Print("Too many erasures at L3.", FTI_DBUG); return FTI_NSCS; } + if (access(FTI_Ckpt[3].dir, F_OK) != 0) + mkdir(FTI_Ckpt[3].dir, 0777); + if (FTI_CheckErasures(&fs, &maxFs, group, erased, 3) != FTI_SCES) // Checking erasures + { + FTI_Print("Error checking erasures.", FTI_DBUG); + return FTI_NSCS; + } + l = 0; + for (j = 0; j < gs; j++) { + if (erased[j]) + l++; + if (erased[j + gs]) + l++; + } // Counting 
erasures + if (l > gs) { + FTI_Print("Too many erasures at L3.", FTI_DBUG); + return FTI_NSCS; + } if (l > 0) { - sprintf(str, "There are %d encoded/checkpoint files missing in this group.", l); FTI_Print(str, FTI_DBUG); - if (FTI_Decode(fs, maxFs, erased) == FTI_NSCS) - { FTI_Print("RS-decoding could not regenerate the missing data.", FTI_DBUG); return FTI_NSCS; } + sprintf(str, "There are %d encoded/checkpoint files missing in this group.", l); + FTI_Print(str, FTI_DBUG); + if (FTI_Decode(fs, maxFs, erased) == FTI_NSCS) { + FTI_Print("RS-decoding could not regenerate the missing data.", FTI_DBUG); + return FTI_NSCS; + } } // Reed-Solomon decoding return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Recover L4 ckpt. files from the PFS. @@ -259,50 +406,79 @@ int FTI_RecoverL3(int group) { **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverL4(int group) { +int FTI_RecoverL4(int group) +{ unsigned long maxFs, fs, ps, pos = 0; - int j, l, gs, erased[FTI_BUFS]; - char gfn[FTI_BUFS], lfn[FTI_BUFS], *blBuf1; - FILE *gfd, *lfd; + int j, l, gs, erased[FTI_BUFS]; + char gfn[FTI_BUFS], lfn[FTI_BUFS], *blBuf1; + FILE *gfd, *lfd; blBuf1 = talloc(char, FTI_Conf.blockSize); // Allocate memory gs = FTI_Topo.groupSize; - if (FTI_Topo.nodeRank == 0 || FTI_Topo.nodeRank == 1) - { - if (access(FTI_Ckpt[1].dir, F_OK) != 0) - { + if (FTI_Topo.nodeRank == 0 || FTI_Topo.nodeRank == 1) { + if (access(FTI_Ckpt[1].dir, F_OK) != 0) { FTI_Print("Directory L1 missing.", FTI_DBUG); - if (mkdir(FTI_Ckpt[1].dir, 0777) == 0) - { + if (mkdir(FTI_Ckpt[1].dir, 0777) == 0) { FTI_Print("Directory L1 created.", FTI_DBUG); - } else { + } + else { FTI_Print("Directory L1 could NOT be created.", FTI_WARN); } } } MPI_Barrier(FTI_COMM_WORLD); - if ( FTI_CheckErasures(&fs, &maxFs, group, erased, 4) != FTI_SCES) // Checking erasures - { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } - l 
= 0; for(j = 0; j < gs; j++) { if(erased[j]) l++; } // Counting erasures - if (l > 0) { FTI_Print("Checkpoint file missing at L4.", FTI_DBUG); return FTI_NSCS; } - ps = (fs/FTI_Conf.blockSize)*FTI_Conf.blockSize; pos = 0; // For the logic - if (ps < fs) ps = ps + FTI_Conf.blockSize; // Calculating padding size - sprintf(gfn,"%s/%s", FTI_Ckpt[4].dir, FTI_Exec.ckptFile); // Open and resize files - sprintf(lfn,"%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); - if (access(gfn, R_OK) != 0) { FTI_Print("R4 cannot read the checkpoint file in the PFS.", FTI_DBUG); return FTI_NSCS; } - if (truncate(gfn,ps) == -1) { FTI_Print("R4 cannot truncate the ckpt. file in the PFS.", FTI_DBUG); return FTI_NSCS; } - gfd = fopen(gfn, "rb"); lfd = fopen(lfn, "wb"); - if (gfd == NULL) { FTI_Print("R4 cannot open the ckpt. file in the PFS.", FTI_DBUG); return FTI_NSCS; } - if (lfd == NULL) { FTI_Print("R4 cannot open the local ckpt. file.", FTI_DBUG); return FTI_NSCS; } - while(pos < ps) { // Checkpoint files transfer from PFS + if (FTI_CheckErasures(&fs, &maxFs, group, erased, 4) != FTI_SCES) // Checking erasures + { + FTI_Print("Error checking erasures.", FTI_DBUG); + return FTI_NSCS; + } + l = 0; + for (j = 0; j < gs; j++) { + if (erased[j]) + l++; + } // Counting erasures + if (l > 0) { + FTI_Print("Checkpoint file missing at L4.", FTI_DBUG); + return FTI_NSCS; + } + ps = (fs / FTI_Conf.blockSize) * FTI_Conf.blockSize; + pos = 0; // For the logic + if (ps < fs) + ps = ps + FTI_Conf.blockSize; // Calculating padding size + sprintf(gfn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec.ckptFile); // Open and resize files + sprintf(lfn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); + if (access(gfn, R_OK) != 0) { + FTI_Print("R4 cannot read the checkpoint file in the PFS.", FTI_DBUG); + return FTI_NSCS; + } + if (truncate(gfn, ps) == -1) { + FTI_Print("R4 cannot truncate the ckpt. 
file in the PFS.", FTI_DBUG); + return FTI_NSCS; + } + gfd = fopen(gfn, "rb"); + lfd = fopen(lfn, "wb"); + if (gfd == NULL) { + FTI_Print("R4 cannot open the ckpt. file in the PFS.", FTI_DBUG); + return FTI_NSCS; + } + if (lfd == NULL) { + FTI_Print("R4 cannot open the local ckpt. file.", FTI_DBUG); + return FTI_NSCS; + } + while (pos < ps) { // Checkpoint files transfer from PFS fread(blBuf1, sizeof(char), FTI_Conf.blockSize, gfd); fwrite(blBuf1, sizeof(char), FTI_Conf.blockSize, lfd); pos = pos + FTI_Conf.blockSize; } - fclose(gfd); fclose(lfd); // Close files - if (truncate(gfn,fs) == -1) { FTI_Print("R4 cannot re-truncate the checkpoint file in the PFS.", FTI_DBUG); return FTI_NSCS; } - if (truncate(lfn,fs) == -1) { FTI_Print("R4 cannot re-truncate the local checkpoint file.", FTI_DBUG); return FTI_NSCS; } + fclose(gfd); + fclose(lfd); // Close files + if (truncate(gfn, fs) == -1) { + FTI_Print("R4 cannot re-truncate the checkpoint file in the PFS.", FTI_DBUG); + return FTI_NSCS; + } + if (truncate(lfn, fs) == -1) { + FTI_Print("R4 cannot re-truncate the local checkpoint file.", FTI_DBUG); + return FTI_NSCS; + } free(blBuf1); return FTI_SCES; } - - diff --git a/src/recover.c b/src/recover.c index 1d1ec5ddc..3562d4e9b 100644 --- a/src/recover.c +++ b/src/recover.c @@ -5,10 +5,8 @@ * @brief Recovery functions for the FTI library. */ - #include "fti.h" - /*-------------------------------------------------------------------------*/ /** @brief Check if a file exist and that its size is 'correct'. 
@@ -21,27 +19,27 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_CheckFile(char *fn, unsigned long fs) { +int FTI_CheckFile(char* fn, unsigned long fs) +{ struct stat fileStatus; - if (access(fn, F_OK) == 0) - { - if (stat(fn, &fileStatus) == 0) - { - if (fileStatus.st_size == fs) - { + if (access(fn, F_OK) == 0) { + if (stat(fn, &fileStatus) == 0) { + if (fileStatus.st_size == fs) { return 0; - } else { + } + else { return 1; } - } else { + } + else { return 1; } - } else { + } + else { return 1; } } - /*-------------------------------------------------------------------------*/ /** @brief Detects all the erasures for a particular level. @@ -58,57 +56,56 @@ int FTI_CheckFile(char *fn, unsigned long fs) { **/ /*-------------------------------------------------------------------------*/ -int FTI_CheckErasures(unsigned long *fs, unsigned long *maxFs, int group, int *erased, int level) { - int buf; - char fn[FTI_BUFS]; - if (FTI_GetMeta(fs, maxFs, group, level) == FTI_SCES) - { +int FTI_CheckErasures(unsigned long* fs, unsigned long* maxFs, int group, int* erased, int level) +{ + int buf; + char fn[FTI_BUFS]; + if (FTI_GetMeta(fs, maxFs, group, level) == FTI_SCES) { FTI_Print("Metadata obtained.", FTI_DBUG); - } else { + } + else { FTI_Print("Error getting metadata.", FTI_WARN); return FTI_NSCS; } sprintf(fn, "Checking file %s and its erasures.", FTI_Exec.ckptFile); FTI_Print(fn, FTI_DBUG); - switch(level) - { - case 1: { - sprintf(fn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); - buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); - break; - } - case 2: { - sprintf(fn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); - buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); - sscanf(FTI_Exec.ckptFile,"Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); - sprintf(fn,"%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); - 
buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased+FTI_Topo.groupSize, 1, MPI_INT, FTI_Exec.groupComm); - break; - } - case 3: { - sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); - buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); - sscanf(FTI_Exec.ckptFile,"Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); - sprintf(fn,"%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, buf); - buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased+FTI_Topo.groupSize, 1, MPI_INT, FTI_Exec.groupComm); - break; - } - case 4: { - sprintf(fn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec.ckptFile); - buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); - break; - } + switch (level) { + case 1: { + sprintf(fn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); + buf = FTI_CheckFile(fn, *fs); + MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); + break; + } + case 2: { + sprintf(fn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); + buf = FTI_CheckFile(fn, *fs); + MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); + sprintf(fn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); + buf = FTI_CheckFile(fn, *fs); + MPI_Allgather(&buf, 1, MPI_INT, erased + FTI_Topo.groupSize, 1, MPI_INT, FTI_Exec.groupComm); + break; + } + case 3: { + sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); + buf = FTI_CheckFile(fn, *fs); + MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); + sprintf(fn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, buf); + buf = FTI_CheckFile(fn, *fs); + MPI_Allgather(&buf, 1, MPI_INT, erased + FTI_Topo.groupSize, 1, MPI_INT, FTI_Exec.groupComm); + break; + } + case 4: { + sprintf(fn, "%s/%s", 
FTI_Ckpt[4].dir, FTI_Exec.ckptFile); + buf = FTI_CheckFile(fn, *fs); + MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); + break; + } } return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Decides wich action take depending on the restart level. @@ -120,53 +117,55 @@ int FTI_CheckErasures(unsigned long *fs, unsigned long *maxFs, int group, int *e **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverFiles() { - int f, r, tres = FTI_SCES, id, level = 1; +int FTI_RecoverFiles() +{ + int f, r, tres = FTI_SCES, id, level = 1; unsigned long fs, maxFs; - char str[FTI_BUFS]; - if (FTI_Topo.nbHeads == 1) - { + char str[FTI_BUFS]; + if (FTI_Topo.nbHeads == 1) { f = 1; - } else { + } + else { f = 0; } - if (!FTI_Topo.amIaHead) - { - while (level < 5) - { - if ((FTI_Exec.reco == 2) && (level != 4)) - { + if (!FTI_Topo.amIaHead) { + while (level < 5) { + if ((FTI_Exec.reco == 2) && (level != 4)) { tres = FTI_NSCS; - } else { - if (FTI_GetMeta(&fs, &maxFs, f, level) != FTI_SCES) - { + } + else { + if (FTI_GetMeta(&fs, &maxFs, f, level) != FTI_SCES) { tres = FTI_NSCS; - } else { - sscanf(FTI_Exec.ckptFile,"Ckpt%d-Rank%d.fti", &id, &r); + } + else { + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &id, &r); sprintf(str, "Trying recovery with Ckpt. 
%d at level %d.", id, level); FTI_Print(str, FTI_DBUG); FTI_Exec.ckptID = id; FTI_Exec.ckptLvel = level; FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; - if (FTI_Exec.ckptLvel == 4) - { + if (FTI_Exec.ckptLvel == 4) { FTI_Clean(1, FTI_Topo.groupID, FTI_Topo.myRank); MPI_Barrier(FTI_COMM_WORLD); } - if (FTI_Exec.ckptLvel == 4) r = FTI_RecoverL4(FTI_Topo.groupID); - if (FTI_Exec.ckptLvel == 3) r = FTI_RecoverL3(FTI_Topo.groupID); - if (FTI_Exec.ckptLvel == 2) r = FTI_RecoverL2(FTI_Topo.groupID); - if (FTI_Exec.ckptLvel == 1) r = FTI_RecoverL1(FTI_Topo.groupID); + if (FTI_Exec.ckptLvel == 4) + r = FTI_RecoverL4(FTI_Topo.groupID); + if (FTI_Exec.ckptLvel == 3) + r = FTI_RecoverL3(FTI_Topo.groupID); + if (FTI_Exec.ckptLvel == 2) + r = FTI_RecoverL2(FTI_Topo.groupID); + if (FTI_Exec.ckptLvel == 1) + r = FTI_RecoverL1(FTI_Topo.groupID); MPI_Allreduce(&r, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); } } - if (tres == FTI_SCES) - { + if (tres == FTI_SCES) { sprintf(str, "Recovering successfully from level %d.", level); FTI_Print(str, FTI_INFO); break; - } else { + } + else { sprintf(str, "No possible to restart from level %d.", level); FTI_Print(str, FTI_INFO); level++; @@ -179,5 +178,3 @@ int FTI_RecoverFiles() { sleep(1); // Global barrier and sleep for clearer output return tres; } - - diff --git a/src/tools.c b/src/tools.c index 04afb90d4..8213cb0df 100644 --- a/src/tools.c +++ b/src/tools.c @@ -5,12 +5,10 @@ * @brief Utility functions for the FTI library. 
*/ - #include "fti.h" #include #include - int FTI_Clean(int level, int group, int rank); /*-------------------------------------------------------------------------*/ @@ -27,35 +25,32 @@ int FTI_Clean(int level, int group, int rank); **/ /*-------------------------------------------------------------------------*/ -void FTI_Print(char *msg, int priority) { - if (priority >= FTI_Conf.verbosity) - { - if (msg != NULL) - { - switch(priority) - { - case FTI_EROR: - fprintf(stderr, "[FTI Error - %06d] : %s : %s \n", FTI_Topo.myRank, msg, strerror(errno)); - break; - case FTI_WARN: - fprintf(stdout, "[FTI Warning %06d] : %s \n", FTI_Topo.myRank, msg); - break; - case FTI_INFO: - if (FTI_Topo.splitRank == 0) - fprintf(stdout, "[ FTI Information ] : %s \n", msg); - break; - case FTI_DBUG: - fprintf(stdout, "[FTI Debug - %06d] : %s \n", FTI_Topo.myRank, msg); - break; - default: - break; +void FTI_Print(char* msg, int priority) +{ + if (priority >= FTI_Conf.verbosity) { + if (msg != NULL) { + switch (priority) { + case FTI_EROR: + fprintf(stderr, "[FTI Error - %06d] : %s : %s \n", FTI_Topo.myRank, msg, strerror(errno)); + break; + case FTI_WARN: + fprintf(stdout, "[FTI Warning %06d] : %s \n", FTI_Topo.myRank, msg); + break; + case FTI_INFO: + if (FTI_Topo.splitRank == 0) + fprintf(stdout, "[ FTI Information ] : %s \n", msg); + break; + case FTI_DBUG: + fprintf(stdout, "[FTI Debug - %06d] : %s \n", FTI_Topo.myRank, msg); + break; + default: + break; } } } fflush(stdout); } - /*-------------------------------------------------------------------------*/ /** @brief Receive the return code of a function and print a message. 
@@ -68,13 +63,14 @@ void FTI_Print(char *msg, int priority) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Try(int result, char* message) { +int FTI_Try(int result, char* message) +{ char str[FTI_BUFS]; - if (result == FTI_SCES || result == FTI_DONE) - { + if (result == FTI_SCES || result == FTI_DONE) { sprintf(str, "FTI succeeded to %s", message); FTI_Print(str, FTI_DBUG); - } else { + } + else { sprintf(str, "FTI failed to %s", message); FTI_Print(str, FTI_WARN); sprintf(str, "Error => %s", strerror(errno)); @@ -83,7 +79,6 @@ int FTI_Try(int result, char* message) { return result; } - /*-------------------------------------------------------------------------*/ /** @brief It creates the basic datatypes and the dataset array. @@ -94,10 +89,10 @@ int FTI_Try(int result, char* message) { **/ /*-------------------------------------------------------------------------*/ -int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]) { +int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]) +{ int i; - for (i = 0; i < FTI_BUFS; i++) - { + for (i = 0; i < FTI_BUFS; i++) { FTI_Data[i].id = -1; } FTI_InitType(&FTI_CHAR, sizeof(char)); @@ -114,7 +109,6 @@ int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief It erases a directory and all its files. 
@@ -131,32 +125,31 @@ int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]) { int FTI_RmDir(char path[FTI_BUFS], int flag) { - if (flag && (!access(path, R_OK))) - { + if (flag && (!access(path, R_OK))) { DIR* dp; char buf[FTI_BUFS], fn[FTI_BUFS], fil[FTI_BUFS]; struct dirent* ep; dp = opendir(path); sprintf(buf, "Removing directory %s and its files.", path); FTI_Print(buf, FTI_DBUG); - if (dp != NULL) - { - while(ep = readdir(dp)) - { + if (dp != NULL) { + while (ep = readdir(dp)) { sprintf(fil, "%s", ep->d_name); - if ((strcmp(fil, ".") != 0) && (strcmp(fil, "..") != 0)) - { + if ((strcmp(fil, ".") != 0) && (strcmp(fil, "..") != 0)) { sprintf(fn, "%s/%s", path, fil); sprintf(buf, "File %s will be removed.", fn); FTI_Print(buf, FTI_DBUG); - if (remove(fn) != 0) FTI_Print("Error removing target file.", FTI_EROR); + if (remove(fn) != 0) + FTI_Print("Error removing target file.", FTI_EROR); } } - } else { + } + else { FTI_Print("Error with opendir.", FTI_EROR); } closedir(dp); - if (remove(path) != 0) FTI_Print("Error removing target directory.", FTI_EROR); + if (remove(path) != 0) + FTI_Print("Error removing target directory.", FTI_EROR); } return FTI_SCES; } @@ -175,52 +168,44 @@ int FTI_RmDir(char path[FTI_BUFS], int flag) **/ /*-------------------------------------------------------------------------*/ -int FTI_Clean(int level, int group, int rank) { +int FTI_Clean(int level, int group, int rank) +{ char buf[FTI_BUFS]; int nodeFlag, globalFlag = !FTI_Topo.splitRank; - nodeFlag = (((!FTI_Topo.amIaHead) && (FTI_Topo.nodeRank == 0)) || (FTI_Topo.amIaHead))? 1 : 0; - if (level == 0) - { + nodeFlag = (((!FTI_Topo.amIaHead) && (FTI_Topo.nodeRank == 0)) || (FTI_Topo.amIaHead)) ? 
1 : 0; + if (level == 0) { FTI_RmDir(FTI_Conf.mTmpDir, globalFlag); FTI_RmDir(FTI_Conf.gTmpDir, globalFlag); FTI_RmDir(FTI_Conf.lTmpDir, nodeFlag); } - if (level >= 1) - { // Clean last checkpoint level 1 - FTI_RmDir(FTI_Ckpt[1].metaDir, globalFlag); - FTI_RmDir(FTI_Ckpt[1].dir, nodeFlag); + if (level >= 1) { // Clean last checkpoint level 1 + FTI_RmDir(FTI_Ckpt[1].metaDir, globalFlag); + FTI_RmDir(FTI_Ckpt[1].dir, nodeFlag); } - if (level >= 2) - { // Clean last checkpoint level 2 - FTI_RmDir(FTI_Ckpt[2].metaDir, globalFlag); - FTI_RmDir(FTI_Ckpt[2].dir, nodeFlag); + if (level >= 2) { // Clean last checkpoint level 2 + FTI_RmDir(FTI_Ckpt[2].metaDir, globalFlag); + FTI_RmDir(FTI_Ckpt[2].dir, nodeFlag); } - if (level >= 3) - { // Clean last checkpoint level 3 - FTI_RmDir(FTI_Ckpt[3].metaDir, globalFlag); - FTI_RmDir(FTI_Ckpt[3].dir, nodeFlag); + if (level >= 3) { // Clean last checkpoint level 3 + FTI_RmDir(FTI_Ckpt[3].metaDir, globalFlag); + FTI_RmDir(FTI_Ckpt[3].dir, nodeFlag); } - if (level == 4 || level == 5) - { // Clean last checkpoint level 4 - FTI_RmDir(FTI_Ckpt[4].metaDir, globalFlag); - FTI_RmDir(FTI_Ckpt[4].dir, globalFlag); + if (level == 4 || level == 5) { // Clean last checkpoint level 4 + FTI_RmDir(FTI_Ckpt[4].metaDir, globalFlag); + FTI_RmDir(FTI_Ckpt[4].dir, globalFlag); rmdir(FTI_Conf.gTmpDir); } - if (level == 5) - { // If it is the very last cleaning and we DO NOT keep the last checkpoint + if (level == 5) { // If it is the very last cleaning and we DO NOT keep the last checkpoint rmdir(FTI_Conf.lTmpDir); rmdir(FTI_Conf.localDir); rmdir(FTI_Conf.glbalDir); - snprintf(buf, FTI_BUFS, "%s/Topology.fti",FTI_Conf.metadDir); + snprintf(buf, FTI_BUFS, "%s/Topology.fti", FTI_Conf.metadDir); remove(buf); rmdir(FTI_Conf.metadDir); } - if (level == 6) - { // If it is the very last cleaning and we DO keep the last checkpoint + if (level == 6) { // If it is the very last cleaning and we DO keep the last checkpoint rmdir(FTI_Conf.lTmpDir); 
rmdir(FTI_Conf.localDir); } return FTI_SCES; } - - diff --git a/src/topo.c b/src/topo.c index ba1d59240..7682d9a66 100644 --- a/src/topo.c +++ b/src/topo.c @@ -5,10 +5,8 @@ * @brief Topology functions for the FTI library. */ - #include "fti.h" - /*-------------------------------------------------------------------------*/ /** @brief Writes the topology in a file for recovery. @@ -21,45 +19,41 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_SaveTopo(char *nameList) { +int FTI_SaveTopo(char* nameList) +{ char mfn[FTI_BUFS], str[FTI_BUFS]; - dictionary *ini; + dictionary* ini; int i; sprintf(str, "Trying to load configuration file (%s) to create topology.", FTI_Conf.cfgFile); FTI_Print(str, FTI_DBUG); ini = iniparser_load(FTI_Conf.cfgFile); - if (ini == NULL) - { + if (ini == NULL) { FTI_Print("Iniparser cannot parse the configuration file.", FTI_WARN); return FTI_NSCS; } iniparser_set(ini, "topology", NULL); // Set topology section - for (i = 0; i < FTI_Topo.nbNodes; i++) - { // Write list of nodes - strncpy(mfn,nameList+(i*FTI_BUFS),FTI_BUFS); + for (i = 0; i < FTI_Topo.nbNodes; i++) { // Write list of nodes + strncpy(mfn, nameList + (i * FTI_BUFS), FTI_BUFS); sprintf(str, "topology:%d", i); iniparser_set(ini, str, mfn); } // Unset sections of the configuration file iniparser_unset(ini, "basic"); iniparser_unset(ini, "restart"); iniparser_unset(ini, "advanced"); - sprintf(mfn,"%s/Topology.fti", FTI_Conf.metadDir); - sprintf(str,"Creating topology file (%s)...", mfn); + sprintf(mfn, "%s/Topology.fti", FTI_Conf.metadDir); + sprintf(str, "Creating topology file (%s)...", mfn); FTI_Print(str, FTI_DBUG); - FILE *fd = fopen(mfn, "w"); - if (fd == NULL) - { + FILE* fd = fopen(mfn, "w"); + if (fd == NULL) { FTI_Print("Topology file could NOT be opened", FTI_WARN); return FTI_NSCS; } iniparser_dump_ini(ini, fd); // Write new topology - if (fflush(fd) != 0) - { + if (fflush(fd) != 0) { FTI_Print("Topology file could NOT be 
flushed.", FTI_WARN); return FTI_NSCS; } - if (fclose(fd) != 0) - { + if (fclose(fd) != 0) { FTI_Print("Topology file could NOT be closed.", FTI_WARN); return FTI_NSCS; } @@ -67,7 +61,6 @@ int FTI_SaveTopo(char *nameList) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Reorder the nodes following the previous topology. @@ -81,40 +74,36 @@ int FTI_SaveTopo(char *nameList) { **/ /*-------------------------------------------------------------------------*/ -int FTI_ReorderNodes(int *nodeList, char *nameList) { +int FTI_ReorderNodes(int* nodeList, char* nameList) +{ char mfn[FTI_BUFS], str[FTI_BUFS], *tmp; int i, j, *nl, *old, *new; nl = talloc(int, FTI_Topo.nbProc); old = talloc(int, FTI_Topo.nbNodes); new = talloc(int, FTI_Topo.nbNodes); - for (i = 0; i < FTI_Topo.nbNodes; i++) - { + for (i = 0; i < FTI_Topo.nbNodes; i++) { old[i] = -1; new[i] = -1; } - sprintf(mfn,"%s/Topology.fti", FTI_Conf.metadDir); + sprintf(mfn, "%s/Topology.fti", FTI_Conf.metadDir); sprintf(str, "Loading FTI topology file (%s) to reorder nodes...", mfn); FTI_Print(str, FTI_DBUG); - if (access(mfn, F_OK) != 0) - { // Checking that the topology file exist + if (access(mfn, F_OK) != 0) { // Checking that the topology file exist FTI_Print("The topology file is NOT accessible.", FTI_WARN); return FTI_NSCS; } - dictionary *ini; + dictionary* ini; ini = iniparser_load(mfn); - if (ini == NULL) - { + if (ini == NULL) { FTI_Print("Iniparser could NOT parse the topology file.", FTI_WARN); return FTI_NSCS; } - for (i = 0; i < FTI_Topo.nbNodes; i++) - { // Get the old order of nodes + for (i = 0; i < FTI_Topo.nbNodes; i++) { // Get the old order of nodes sprintf(str, "Topology:%d", i); tmp = iniparser_getstring(ini, str, NULL); snprintf(str, FTI_BUFS, "%s", tmp); - for (j = 0; j < FTI_Topo.nbNodes; j++) - { // Search for same node in current nameList - if (strncmp(str,nameList+(j*FTI_BUFS),FTI_BUFS) == 0) // If found... 
+ for (j = 0; j < FTI_Topo.nbNodes; j++) { // Search for same node in current nameList + if (strncmp(str, nameList + (j * FTI_BUFS), FTI_BUFS) == 0) // If found... { old[j] = i; new[i] = j; @@ -124,12 +113,9 @@ int FTI_ReorderNodes(int *nodeList, char *nameList) { } iniparser_freedict(ini); j = 0; - for (i = 0; i < FTI_Topo.nbNodes; i++) - { // Introducing missing nodes - if (new[i] == -1) - { // For each new node.. - while(old[j] != -1) - { // ..search for an old node not present in the new list... + for (i = 0; i < FTI_Topo.nbNodes; i++) { // Introducing missing nodes + if (new[i] == -1) { // For each new node.. + while (old[j] != -1) { // ..search for an old node not present in the new list... j++; } // .. and set matching IDs old[j] = i; @@ -137,15 +123,12 @@ int FTI_ReorderNodes(int *nodeList, char *nameList) { j++; } } - for (i = 0; i < FTI_Topo.nbProc; i++) - { // Copying nodeList in nl + for (i = 0; i < FTI_Topo.nbProc; i++) { // Copying nodeList in nl nl[i] = nodeList[i]; } - for (i = 0; i < FTI_Topo.nbNodes; i++) - { // Creating the new nodeList with the old order - for (j = 0; j < FTI_Topo.nodeSize; j++) - { - nodeList[(i*FTI_Topo.nodeSize)+j] = nl[(new[i]*FTI_Topo.nodeSize)+j]; + for (i = 0; i < FTI_Topo.nbNodes; i++) { // Creating the new nodeList with the old order + for (j = 0; j < FTI_Topo.nodeSize; j++) { + nodeList[(i * FTI_Topo.nodeSize) + j] = nl[(new[i] * FTI_Topo.nodeSize) + j]; } } // Free memory free(old); @@ -154,7 +137,6 @@ int FTI_ReorderNodes(int *nodeList, char *nameList) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Build the list of nodes in the current execution. 
@@ -168,58 +150,54 @@ int FTI_ReorderNodes(int *nodeList, char *nameList) { **/ /*-------------------------------------------------------------------------*/ -int FTI_BuildNodeList(int *nodeList, char *nameList) { +int FTI_BuildNodeList(int* nodeList, char* nameList) +{ int i, found, pos, p, nbNodes = 0; char hname[FTI_BUFS], str[FTI_BUFS], *lhn; - lhn = talloc(char, FTI_BUFS*FTI_Topo.nbProc); - memset(lhn+(FTI_Topo.myRank*FTI_BUFS), 0, FTI_BUFS); // To get local hostname - if (!FTI_Conf.test) - { - gethostname(lhn+(FTI_Topo.myRank*FTI_BUFS),FTI_BUFS); // NOT local test - } else { - snprintf(lhn+(FTI_Topo.myRank*FTI_BUFS),FTI_BUFS,"node%d",FTI_Topo.myRank/FTI_Topo.nodeSize); // Local + lhn = talloc(char, FTI_BUFS* FTI_Topo.nbProc); + memset(lhn + (FTI_Topo.myRank * FTI_BUFS), 0, FTI_BUFS); // To get local hostname + if (!FTI_Conf.test) { + gethostname(lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS); // NOT local test } - strncpy(hname,lhn+(FTI_Topo.myRank*FTI_BUFS),FTI_BUFS); // Distributing host names + else { + snprintf(lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS, "node%d", FTI_Topo.myRank / FTI_Topo.nodeSize); // Local + } + strncpy(hname, lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS); // Distributing host names MPI_Allgather(hname, FTI_BUFS, MPI_CHAR, lhn, FTI_BUFS, MPI_CHAR, FTI_Exec.globalComm); - for (i = 0; i < FTI_Topo.nbProc; i++) - { // Creating the node list: For each process + for (i = 0; i < FTI_Topo.nbProc; i++) { // Creating the node list: For each process found = 0; pos = 0; - strncpy(hname,lhn+(i*FTI_BUFS),FTI_BUFS); // Get node name of process i - while ((pos < nbNodes) && (found == 0)) - { // Search the node name in the current list of node names - if (strncmp(&(nameList[pos*FTI_BUFS]),hname,FTI_BUFS) == 0) - { // If we find it break out + strncpy(hname, lhn + (i * FTI_BUFS), FTI_BUFS); // Get node name of process i + while ((pos < nbNodes) && (found == 0)) { // Search the node name in the current list of node names + if (strncmp(&(nameList[pos * 
FTI_BUFS]), hname, FTI_BUFS) == 0) { // If we find it break out found = 1; - } else { // Else move to the next name in the list + } + else { // Else move to the next name in the list pos++; } } - if (found) - { // If we found the node name in the current list... - p = pos*FTI_Topo.nodeSize; - while (p < pos*FTI_Topo.nodeSize + FTI_Topo.nodeSize) - { // ... we look for empty spot in this node - if (nodeList[p] == -1) - { + if (found) { // If we found the node name in the current list... + p = pos * FTI_Topo.nodeSize; + while (p < pos * FTI_Topo.nodeSize + FTI_Topo.nodeSize) { // ... we look for empty spot in this node + if (nodeList[p] == -1) { nodeList[p] = i; break; - } else { + } + else { p++; } } - } else { // ... else, we add the new node to the end of the current list of nodes - strncpy(&(nameList[pos*FTI_BUFS]),hname,FTI_BUFS); - nodeList[pos*FTI_Topo.nodeSize] = i; + } + else { // ... else, we add the new node to the end of the current list of nodes + strncpy(&(nameList[pos * FTI_BUFS]), hname, FTI_BUFS); + nodeList[pos * FTI_Topo.nodeSize] = i; nbNodes++; } } - for (i = 0; i < FTI_Topo.nbProc; i++) - { // Checking that all nodes have nodeSize processes - if (nodeList[i] == -1) - { - sprintf(str, "Node %d has no %d processes", i/FTI_Topo.nodeSize, FTI_Topo.nodeSize); + for (i = 0; i < FTI_Topo.nbProc; i++) { // Checking that all nodes have nodeSize processes + if (nodeList[i] == -1) { + sprintf(str, "Node %d has no %d processes", i / FTI_Topo.nodeSize, FTI_Topo.nodeSize); FTI_Print(str, FTI_WARN); return FTI_NSCS; } @@ -228,7 +206,6 @@ int FTI_BuildNodeList(int *nodeList, char *nameList) { return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Build the list of nodes in the current execution. 
@@ -243,51 +220,47 @@ int FTI_BuildNodeList(int *nodeList, char *nameList) { **/ /*-------------------------------------------------------------------------*/ -int FTI_CreateComms(int *userProcList, int *distProcList, int* nodeList) { +int FTI_CreateComms(int* userProcList, int* distProcList, int* nodeList) +{ MPI_Status status; char str[FTI_BUFS]; MPI_Group newGroup, origGroup; MPI_Comm_group(FTI_Exec.globalComm, &origGroup); int i, src, buf, group[FTI_BUFS]; // FTI_BUFS > Max. group size - if (FTI_Topo.amIaHead) - { - MPI_Group_incl(origGroup, FTI_Topo.nbNodes*FTI_Topo.nbHeads, distProcList, &newGroup); + if (FTI_Topo.amIaHead) { + MPI_Group_incl(origGroup, FTI_Topo.nbNodes * FTI_Topo.nbHeads, distProcList, &newGroup); MPI_Comm_create(FTI_Exec.globalComm, newGroup, &FTI_COMM_WORLD); - for (i = FTI_Topo.nbHeads; i < FTI_Topo.nodeSize; i++) - { - src = nodeList[(FTI_Topo.nodeID*FTI_Topo.nodeSize)+i]; + for (i = FTI_Topo.nbHeads; i < FTI_Topo.nodeSize; i++) { + src = nodeList[(FTI_Topo.nodeID * FTI_Topo.nodeSize) + i]; MPI_Recv(&buf, 1, MPI_INT, src, FTI_Conf.tag, FTI_Exec.globalComm, &status); - if (buf == src) - { - FTI_Topo.body[i-FTI_Topo.nbHeads] = src; + if (buf == src) { + FTI_Topo.body[i - FTI_Topo.nbHeads] = src; } } - } else { - MPI_Group_incl(origGroup, FTI_Topo.nbProc-(FTI_Topo.nbNodes*FTI_Topo.nbHeads), userProcList, &newGroup); + } + else { + MPI_Group_incl(origGroup, FTI_Topo.nbProc - (FTI_Topo.nbNodes * FTI_Topo.nbHeads), userProcList, &newGroup); MPI_Comm_create(FTI_Exec.globalComm, newGroup, &FTI_COMM_WORLD); - if (FTI_Topo.nbHeads == 1) - { + if (FTI_Topo.nbHeads == 1) { MPI_Send(&(FTI_Topo.myRank), 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm); } } MPI_Comm_rank(FTI_COMM_WORLD, &FTI_Topo.splitRank); - buf = FTI_Topo.sectorID*FTI_Topo.groupSize; - for (i = 0; i < FTI_Topo.groupSize; i++) - { // Group of node-distributed processes (Topology-aware). 
- group[i] = distProcList[buf+i]; + buf = FTI_Topo.sectorID * FTI_Topo.groupSize; + for (i = 0; i < FTI_Topo.groupSize; i++) { // Group of node-distributed processes (Topology-aware). + group[i] = distProcList[buf + i]; } MPI_Comm_group(FTI_Exec.globalComm, &origGroup); MPI_Group_incl(origGroup, FTI_Topo.groupSize, group, &newGroup); MPI_Comm_create(FTI_Exec.globalComm, newGroup, &FTI_Exec.groupComm); - MPI_Group_rank (newGroup, &(FTI_Topo.groupRank)); - FTI_Topo.right = (FTI_Topo.groupRank+1)%FTI_Topo.groupSize; - FTI_Topo.left = (FTI_Topo.groupRank+FTI_Topo.groupSize-1)%FTI_Topo.groupSize; + MPI_Group_rank(newGroup, &(FTI_Topo.groupRank)); + FTI_Topo.right = (FTI_Topo.groupRank + 1) % FTI_Topo.groupSize; + FTI_Topo.left = (FTI_Topo.groupRank + FTI_Topo.groupSize - 1) % FTI_Topo.groupSize; MPI_Group_free(&origGroup); MPI_Group_free(&newGroup); return FTI_SCES; } - /*-------------------------------------------------------------------------*/ /** @brief Builds and saves the topology of the current execution. 
@@ -299,69 +272,59 @@ int FTI_CreateComms(int *userProcList, int *distProcList, int* nodeList) { **/ /*-------------------------------------------------------------------------*/ -int FTI_Topology() { - int res, nn, found, c1=0, c2=0, p, i, mypos, posInNode; - char str[FTI_BUFS], *nameList = talloc(char, FTI_Topo.nbNodes * FTI_BUFS); - int *nodeList = talloc(int, FTI_Topo.nbNodes * FTI_Topo.nodeSize); - int *distProcList = talloc(int, FTI_Topo.nbNodes); - int *userProcList = talloc(int, FTI_Topo.nbProc-(FTI_Topo.nbNodes*FTI_Topo.nbHeads)); - for (i = 0; i < FTI_Topo.nbProc; i++) - { +int FTI_Topology() +{ + int res, nn, found, c1 = 0, c2 = 0, p, i, mypos, posInNode; + char str[FTI_BUFS], *nameList = talloc(char, FTI_Topo.nbNodes *FTI_BUFS); + int* nodeList = talloc(int, FTI_Topo.nbNodes* FTI_Topo.nodeSize); + int* distProcList = talloc(int, FTI_Topo.nbNodes); + int* userProcList = talloc(int, FTI_Topo.nbProc - (FTI_Topo.nbNodes * FTI_Topo.nbHeads)); + for (i = 0; i < FTI_Topo.nbProc; i++) { nodeList[i] = -1; } res = FTI_Try(FTI_BuildNodeList(nodeList, nameList), "create node list."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { return FTI_NSCS; } - if (FTI_Exec.reco > 0) - { + if (FTI_Exec.reco > 0) { res = FTI_Try(FTI_ReorderNodes(nodeList, nameList), "reorder nodes."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { return FTI_NSCS; } } MPI_Barrier(FTI_Exec.globalComm); // Need to synchronize before editing topology file - if (FTI_Topo.myRank == 0 && FTI_Exec.reco == 0) - { + if (FTI_Topo.myRank == 0 && FTI_Exec.reco == 0) { res = FTI_Try(FTI_SaveTopo(nameList), "save topology."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { return FTI_NSCS; } } - for (i = 0; i < FTI_Topo.nbProc; i++) - { - if (FTI_Topo.myRank == nodeList[i]) - { + for (i = 0; i < FTI_Topo.nbProc; i++) { + if (FTI_Topo.myRank == nodeList[i]) { mypos = i; } - if ((i % FTI_Topo.nodeSize != 0) || (FTI_Topo.nbHeads == 0)) - { + if ((i % FTI_Topo.nodeSize != 0) || (FTI_Topo.nbHeads 
== 0)) { userProcList[c2] = nodeList[i]; c2++; } } FTI_Topo.nodeRank = mypos % FTI_Topo.nodeSize; - if (FTI_Topo.nodeRank == 0 && FTI_Topo.nbHeads == 1) - { + if (FTI_Topo.nodeRank == 0 && FTI_Topo.nbHeads == 1) { FTI_Topo.amIaHead = 1; - } else { + } + else { FTI_Topo.amIaHead = 0; } - FTI_Topo.nodeID = mypos/FTI_Topo.nodeSize; - FTI_Topo.headRank = nodeList[(mypos/FTI_Topo.nodeSize)*FTI_Topo.nodeSize]; + FTI_Topo.nodeID = mypos / FTI_Topo.nodeSize; + FTI_Topo.headRank = nodeList[(mypos / FTI_Topo.nodeSize) * FTI_Topo.nodeSize]; FTI_Topo.sectorID = FTI_Topo.nodeID / FTI_Topo.groupSize; - posInNode = mypos%FTI_Topo.nodeSize; + posInNode = mypos % FTI_Topo.nodeSize; FTI_Topo.groupID = posInNode; - for (i = 0; i < FTI_Topo.nbNodes; i++) - { - distProcList[i] = nodeList[(FTI_Topo.nodeSize*i)+posInNode]; + for (i = 0; i < FTI_Topo.nbNodes; i++) { + distProcList[i] = nodeList[(FTI_Topo.nodeSize * i) + posInNode]; } res = FTI_Try(FTI_CreateComms(userProcList, distProcList, nodeList), "create communicators."); - if (res == FTI_NSCS) - { + if (res == FTI_NSCS) { return FTI_NSCS; } free(userProcList); @@ -370,6 +333,3 @@ int FTI_Topology() { free(nodeList); return FTI_SCES; } - - - From b28f6d48120415d5910be34cbea92a66371ad15d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 21:50:00 +0100 Subject: [PATCH 03/93] Creating directory of dependencies --- CMakeLists.txt | 14 +++++++------- deps/CMakeLists.txt | 4 ++++ deps/iniparser/CMakeLists.txt | 7 +++++++ {src => deps/iniparser}/dictionary.c | 0 {include => deps/iniparser}/dictionary.h | 0 {src => deps/iniparser}/iniparser.c | 0 {include => deps/iniparser}/iniparser.h | 0 deps/jerasure/CMakeLists.txt | 7 +++++++ {src => deps/jerasure}/galois.c | 0 {include => deps/jerasure}/galois.h | 0 {src => deps/jerasure}/jerasure.c | 0 {include => deps/jerasure}/jerasure.h | 0 include/fti.h | 7 +++---- 13 files changed, 28 insertions(+), 11 deletions(-) create mode 100644 deps/CMakeLists.txt 
create mode 100644 deps/iniparser/CMakeLists.txt rename {src => deps/iniparser}/dictionary.c (100%) rename {include => deps/iniparser}/dictionary.h (100%) rename {src => deps/iniparser}/iniparser.c (100%) rename {include => deps/iniparser}/iniparser.h (100%) create mode 100644 deps/jerasure/CMakeLists.txt rename {src => deps/jerasure}/galois.c (100%) rename {include => deps/jerasure}/galois.h (100%) rename {src => deps/jerasure}/jerasure.c (100%) rename {include => deps/jerasure}/jerasure.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 196dbb1b3..bcf5b4966 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,8 @@ include(BPP) include(FortranCInterface) find_package(MPI REQUIRED) +add_subdirectory(deps) + option(ENABLE_FORTRAN "Enables the generation of the Fortran wrapper for FTI" ON) include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}/include" ${MPI_Fortran_INCLUDE_PATH} ${MPI_C_INCLUDE_PATH}) @@ -18,10 +20,6 @@ set (LIBRARY_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR}/lib) set (CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include) set(SRC_FTI - src/galois.c - src/jerasure.c - src/dictionary.c - src/iniparser.c src/checkpoint.c src/postckpt.c src/recover.c @@ -35,14 +33,16 @@ set(SRC_FTI append_property(SOURCE ${SRC_FTI} PROPERTY COMPILE_FLAGS " ${MPI_C_COMPILE_FLAGS} ") add_library(fti.static STATIC ${SRC_FTI}) -target_link_libraries(fti.static ${MPI_C_LIBRARIES}) +target_link_libraries(fti.static ${MPI_C_LIBRARIES} iniparser jerasure) + add_library(fti.shared SHARED ${SRC_FTI}) -target_link_libraries(fti.shared ${MPI_C_LIBRARIES}) +target_link_libraries(fti.shared ${MPI_C_LIBRARIES} iniparser jerasure) + append_property(TARGET fti.static fti.shared PROPERTY LINK_FLAGS " ${MPI_C_LINK_FLAGS} ") set_property(TARGET fti.static fti.shared PROPERTY OUTPUT_NAME fti) install(TARGETS fti.static fti.shared DESTINATION lib) -install(FILES "include/fti.h" "include/dictionary.h" 
"include/galois.h" "include/iniparser.h" "include/jerasure.h" DESTINATION include) +install(FILES "include/fti.h" DESTINATION include) if ( ENABLE_FORTRAN ) bpp_preprocess(BPP_FTI_F90 diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt new file mode 100644 index 000000000..f5068132d --- /dev/null +++ b/deps/CMakeLists.txt @@ -0,0 +1,4 @@ +cmake_minimum_required(VERSION 2.8) + +add_subdirectory(iniparser) +add_subdirectory(jerasure) diff --git a/deps/iniparser/CMakeLists.txt b/deps/iniparser/CMakeLists.txt new file mode 100644 index 000000000..1e13d3aae --- /dev/null +++ b/deps/iniparser/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 2.8) + +set(SRC_iniparser + dictionary.h dictionary.c + iniparser.h iniparser.c) + +add_library(iniparser ${SRC_iniparser}) diff --git a/src/dictionary.c b/deps/iniparser/dictionary.c similarity index 100% rename from src/dictionary.c rename to deps/iniparser/dictionary.c diff --git a/include/dictionary.h b/deps/iniparser/dictionary.h similarity index 100% rename from include/dictionary.h rename to deps/iniparser/dictionary.h diff --git a/src/iniparser.c b/deps/iniparser/iniparser.c similarity index 100% rename from src/iniparser.c rename to deps/iniparser/iniparser.c diff --git a/include/iniparser.h b/deps/iniparser/iniparser.h similarity index 100% rename from include/iniparser.h rename to deps/iniparser/iniparser.h diff --git a/deps/jerasure/CMakeLists.txt b/deps/jerasure/CMakeLists.txt new file mode 100644 index 000000000..7c48d6daf --- /dev/null +++ b/deps/jerasure/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 2.8) + +set(SRC_jerasure + galois.h galois.c + jerasure.h jerasure.c) + +add_library(jerasure ${SRC_jerasure}) diff --git a/src/galois.c b/deps/jerasure/galois.c similarity index 100% rename from src/galois.c rename to deps/jerasure/galois.c diff --git a/include/galois.h b/deps/jerasure/galois.h similarity index 100% rename from include/galois.h rename to deps/jerasure/galois.h diff --git 
a/src/jerasure.c b/deps/jerasure/jerasure.c similarity index 100% rename from src/jerasure.c rename to deps/jerasure/jerasure.c diff --git a/include/jerasure.h b/deps/jerasure/jerasure.h similarity index 100% rename from include/jerasure.h rename to deps/jerasure/jerasure.h diff --git a/include/fti.h b/include/fti.h index 2360b8b82..4d9f4bf40 100644 --- a/include/fti.h +++ b/include/fti.h @@ -18,10 +18,9 @@ #include "mpi.h" -#include "iniparser.h" -#include "galois.h" -#include "jerasure.h" - +#include "../deps/iniparser/iniparser.h" +#include "../deps/jerasure/galois.h" +#include "../deps/jerasure/jerasure.h" /*--------------------------------------------------------------------------- Defines From a5a16701804c504996b9a4aed7e101f87ad78e9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 2 Feb 2016 12:36:29 +0100 Subject: [PATCH 04/93] Adding examples to cmake build and moving fortran code to its own directory --- CMakeLists.txt | 86 ++++++++++++++++------------- examples/CMakeLists.txt | 47 ++++++++++++++++ src/{ => fortran}/ftif.c | 0 src/{ => fortran}/interface.F90.bpp | 0 4 files changed, 95 insertions(+), 38 deletions(-) create mode 100644 examples/CMakeLists.txt rename src/{ => fortran}/ftif.c (100%) rename src/{ => fortran}/interface.F90.bpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index bcf5b4966..34ea45c4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,36 +1,34 @@ -cmake_minimum_required(VERSION 2.8) -list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeScripts") +cmake_minimum_required(VERSION 2.8 FATAL_ERROR) -if ( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}" ) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") project(FTI C Fortran) endif() +option(ENABLE_FORTRAN "Enables the generation of the Fortran wrapper for FTI" ON) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeScripts") include(AppendProperty) include(BPP) include(FortranCInterface) + 
find_package(MPI REQUIRED) add_subdirectory(deps) -option(ENABLE_FORTRAN "Enables the generation of the Fortran wrapper for FTI" ON) +include_directories("${CMAKE_CURRENT_BINARY_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}/include" + ${MPI_Fortran_INCLUDE_PATH} ${MPI_C_INCLUDE_PATH}) -include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_SOURCE_DIR}/include" ${MPI_Fortran_INCLUDE_PATH} ${MPI_C_INCLUDE_PATH}) - -set (LIBRARY_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR}/lib) -set (CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include) +set(LIBRARY_OUTPUT_PATH ${CMAKE_CURRENT_BINARY_DIR}/lib) +set(CMAKE_Fortran_MODULE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include) set(SRC_FTI - src/checkpoint.c - src/postckpt.c - src/recover.c - src/postreco.c - src/topo.c - src/conf.c - src/meta.c - src/tools.c - src/api.c -) -append_property(SOURCE ${SRC_FTI} PROPERTY COMPILE_FLAGS " ${MPI_C_COMPILE_FLAGS} ") + src/api.c src/checkpoint.c src/conf.c src/meta.c + src/postckpt.c src/postreco.c src/recover.c + src/tools.c src/topo.c) + +append_property(SOURCE ${SRC_FTI} + PROPERTY COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}") add_library(fti.static STATIC ${SRC_FTI}) target_link_libraries(fti.static ${MPI_C_LIBRARIES} iniparser jerasure) @@ -38,33 +36,45 @@ target_link_libraries(fti.static ${MPI_C_LIBRARIES} iniparser jerasure) add_library(fti.shared SHARED ${SRC_FTI}) target_link_libraries(fti.shared ${MPI_C_LIBRARIES} iniparser jerasure) -append_property(TARGET fti.static fti.shared PROPERTY LINK_FLAGS " ${MPI_C_LINK_FLAGS} ") -set_property(TARGET fti.static fti.shared PROPERTY OUTPUT_NAME fti) +append_property(TARGET fti.static fti.shared + PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}") +set_property(TARGET fti.static fti.shared + PROPERTY OUTPUT_NAME fti) -install(TARGETS fti.static fti.shared DESTINATION lib) -install(FILES "include/fti.h" DESTINATION include) +install(TARGETS fti.static fti.shared + DESTINATION 
lib) +install(FILES "include/fti.h" + DESTINATION include) -if ( ENABLE_FORTRAN ) +if(ENABLE_FORTRAN) bpp_preprocess(BPP_FTI_F90 - src/interface.F90.bpp - ) - set(SRC_FTI_F90 - ${BPP_FTI_F90} - src/ftif.c - ) - append_property(SOURCE ${SRC_FTI_F90} PROPERTY COMPILE_FLAGS " ${MPI_Fortran_COMPILE_FLAGS} ") + src/fortran/interface.F90.bpp) + set(SRC_FTI_F90 ${BPP_FTI_F90} + src/fortran/ftif.c) + append_property(SOURCE ${SRC_FTI_F90} + PROPERTY COMPILE_FLAGS "${MPI_Fortran_COMPILE_FLAGS}") add_library(fti_f90.static STATIC ${SRC_FTI_F90}) - target_link_libraries(fti_f90.static fti.static ${MPI_Fortran_LIBRARIES} ${MPI_C_LIBRARIES}) + target_link_libraries(fti_f90.static + fti.static ${MPI_Fortran_LIBRARIES} ${MPI_C_LIBRARIES}) + add_library(fti_f90.shared SHARED ${SRC_FTI_F90}) - target_link_libraries(fti_f90.shared fti.shared ${MPI_Fortran_LIBRARIES} ${MPI_C_LIBRARIES}) - append_property(TARGET fti_f90.static fti_f90.shared PROPERTY LINK_FLAGS " ${MPI_Fortran_LINK_FLAGS} ${MPI_C_LINK_FLAGS} ") - set_property(TARGET fti_f90.static fti_f90.shared PROPERTY OUTPUT_NAME fti_f90) + target_link_libraries(fti_f90.shared + fti.shared ${MPI_Fortran_LIBRARIES} ${MPI_C_LIBRARIES}) - install(TARGETS fti_f90.static fti_f90.shared DESTINATION lib) - install(FILES ${CMAKE_Fortran_MODULE_DIRECTORY}/fti.mod DESTINATION include) + append_property(TARGET fti_f90.static fti_f90.shared + PROPERTY LINK_FLAGS "${MPI_Fortran_LINK_FLAGS} ${MPI_C_LINK_FLAGS}") + set_property(TARGET fti_f90.static fti_f90.shared + PROPERTY OUTPUT_NAME fti_f90) + + install(TARGETS fti_f90.static fti_f90.shared + DESTINATION lib) + install(FILES ${CMAKE_Fortran_MODULE_DIRECTORY}/fti.mod + DESTINATION include) endif() -if ( NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}" ) +if(NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") set(FTI_INCLUDE_PATH "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/include" PARENT_SCOPE) endif() + +add_subdirectory(examples) diff --git 
a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..c88132ef1 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,47 @@ +cmake_minimum_required(VERSION 2.8) + +link_directories(${CMAKE_BINARY_DIR}/lib) + + +add_executable(hd.exe heatdis.c) +target_link_libraries(hd.exe fti ${MPI_C_LIBRARIES}) + +if(MPI_C_COMPILE_FLAGS) + set_target_properties(hd.exe PROPERTIES + COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}") +endif() + +if(MPI_C_LINK_FLAGS) + set_target_properties(hd.exe PROPERTIES + LINK_FLAGS "${MPI_C_LINK_FLAGS}") +endif() + + +add_executable(hd2.exe heatd2.c) +target_link_libraries(hd2.exe fti ${MPI_C_LIBRARIES}) + +if(MPI_C_COMPILE_FLAGS) + set_target_properties(hd2.exe PROPERTIES + COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}") +endif() + +if(MPI_C_LINK_FLAGS) + set_target_properties(hd2.exe PROPERTIES + LINK_FLAGS "${MPI_C_LINK_FLAGS}") +endif() + + +if(ENABLE_FORTRAN) + add_executable(hdf.exe fheatdis.f90) + target_link_libraries(hdf.exe fti_f90 fti ${MPI_Fortran_LIBRARIES}) + + if(MPI_Fortran_COMPILE_FLAGS) + set_target_properties(hdf.exe PROPERTIES + COMPILE_FLAGS "${MPI_Fortran_COMPILE_FLAGS}") + endif() + + if(MPI_Fortran_LINK_FLAGS) + set_target_properties(hdf.exe PROPERTIES + LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") + endif() +endif() diff --git a/src/ftif.c b/src/fortran/ftif.c similarity index 100% rename from src/ftif.c rename to src/fortran/ftif.c diff --git a/src/interface.F90.bpp b/src/fortran/interface.F90.bpp similarity index 100% rename from src/interface.F90.bpp rename to src/fortran/interface.F90.bpp From 3202429ae09758c9b662f3bf2e492780e194c3fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 2 Feb 2016 14:59:29 +0100 Subject: [PATCH 05/93] Adding private header file --- deps/CMakeLists.txt | 2 - deps/iniparser/CMakeLists.txt | 2 - deps/jerasure/CMakeLists.txt | 2 - examples/CMakeLists.txt | 2 - include/fti.h | 137 +++++++++------------------------- src/api.c | 1 + src/checkpoint.c | 1 
+ src/conf.c | 1 + src/interface.h | 77 +++++++++++++++++++ src/meta.c | 1 + src/postckpt.c | 1 + src/postreco.c | 1 + src/recover.c | 1 + src/tools.c | 2 +- src/topo.c | 1 + 15 files changed, 122 insertions(+), 110 deletions(-) create mode 100644 src/interface.h diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt index f5068132d..18bac0331 100644 --- a/deps/CMakeLists.txt +++ b/deps/CMakeLists.txt @@ -1,4 +1,2 @@ -cmake_minimum_required(VERSION 2.8) - add_subdirectory(iniparser) add_subdirectory(jerasure) diff --git a/deps/iniparser/CMakeLists.txt b/deps/iniparser/CMakeLists.txt index 1e13d3aae..f2bb84a4f 100644 --- a/deps/iniparser/CMakeLists.txt +++ b/deps/iniparser/CMakeLists.txt @@ -1,5 +1,3 @@ -cmake_minimum_required(VERSION 2.8) - set(SRC_iniparser dictionary.h dictionary.c iniparser.h iniparser.c) diff --git a/deps/jerasure/CMakeLists.txt b/deps/jerasure/CMakeLists.txt index 7c48d6daf..5cfd297c8 100644 --- a/deps/jerasure/CMakeLists.txt +++ b/deps/jerasure/CMakeLists.txt @@ -1,5 +1,3 @@ -cmake_minimum_required(VERSION 2.8) - set(SRC_jerasure galois.h galois.c jerasure.h jerasure.c) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c88132ef1..c55480a6c 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,3 @@ -cmake_minimum_required(VERSION 2.8) - link_directories(${CMAKE_BINARY_DIR}/lib) diff --git a/include/fti.h b/include/fti.h index 4d9f4bf40..15b0f3509 100644 --- a/include/fti.h +++ b/include/fti.h @@ -16,20 +16,12 @@ #include #include -#include "mpi.h" - -#include "../deps/iniparser/iniparser.h" -#include "../deps/jerasure/galois.h" -#include "../deps/jerasure/jerasure.h" +#include /*--------------------------------------------------------------------------- Defines ---------------------------------------------------------------------------*/ - -/** Malloc macro. */ -#define talloc(type, num) (type *) malloc(sizeof(type)*(num)) - /** Standard size of buffer and mas node size. 
*/ #define FTI_BUFS 256 /** Word size used during RS encoding. */ @@ -73,14 +65,12 @@ extern "C"{ New types ---------------------------------------------------------------------------*/ -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_double - @brief Double mapped as two integers to allow bit-wise operations. - - Double mapped as integer and byte array to allow bit-wise operators so - that we can inject failures on it. -*/ -/*-------------------------------------------------------------------------*/ + * @brief Double mapped as two integers to allow bit-wise operations. + * + * Double mapped as integer and byte array to allow bit-wise operators so + * that we can inject failures on it. + */ typedef union FTIT_double { /** Double mapped as a byte array. */ double value; /** Double floating point value. */ float floatval[2]; /** Float mapped to do bit edits. */ @@ -88,39 +78,33 @@ typedef union FTIT_double { /** Double mapped as a byte array. */ char byte[8]; /** Byte array for coarser control.*/ } FTIT_double; -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_float - @brief Float mapped as integer to allow bit-wise operations. - - Float mapped as integer and byte array to allow bit-wise operators so - that we can inject failures on it. -*/ -/*-------------------------------------------------------------------------*/ + * @brief Float mapped as integer to allow bit-wise operations. + * + * Float mapped as integer and byte array to allow bit-wise operators so + * that we can inject failures on it. + */ typedef union FTIT_float { /** Float mapped as a byte array. */ float value; /** Floating point value. */ int intval; /** Integer mapped to do bit edits.*/ char byte[4]; /** Byte array for coarser control.*/ } FTIT_float; -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_type - @brief Type recognized by FTI. 
- - This type allows handling data structures. -*/ -/*-------------------------------------------------------------------------*/ + * @brief Type recognized by FTI. + * + * This type allows handling data structures. + */ typedef struct FTIT_type { /** FTI type declarator. */ int id; /** ID of the data type. */ int size; /** Size of the data type. */ } FTIT_type; -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_dataset - @brief Dataset metadata. - - This type stores the metadata related with a dataset. -*/ -/*-------------------------------------------------------------------------*/ + * @brief Dataset metadata. + * + * This type stores the metadata related with a dataset. + */ typedef struct FTIT_dataset { /** Dataset metadata. */ int id; /** ID to search/update dataset. */ void *ptr; /** Pointer to the dataset. */ @@ -130,13 +114,11 @@ typedef struct FTIT_dataset { /** Dataset metadata. */ long size; /** Total size of the dataset. */ } FTIT_dataset; -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_execution - @brief Execution metadata - - This type stores all the dynamic metadata related to the current execution -*/ -/*-------------------------------------------------------------------------*/ + * @brief Execution metadata + * + * This type stores all the dynamic metadata related to the current execution + */ typedef struct FTIT_execution { /** Execution metadata. */ char id[FTI_BUFS]; /** Execution ID. */ char ckptFile[FTI_BUFS]; /** Checkpoint file name. */ @@ -164,13 +146,11 @@ typedef struct FTIT_execution { /** Execution metadata. */ MPI_Comm groupComm; /** Group communicator. */ } FTIT_execution; -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_configuration - @brief Configuration metadata. - - This type stores the general configuration metadata. + * @brief Configuration metadata. 
+ * + * This type stores the general configuration metadata. */ -/*-------------------------------------------------------------------------*/ typedef struct FTIT_configuration { /** Configuration metadata. */ char cfgFile[FTI_BUFS]; /** Configuration file name. */ int saveLastCkpt; /** TRUE to save last checkpoint. */ @@ -187,13 +167,11 @@ typedef struct FTIT_configuration { /** Configuration metadata. */ char mTmpDir[FTI_BUFS]; /** Metadata temporary directory. */ } FTIT_configuration; -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_topology - @brief Topology metadata. - - This type stores the topology metadata. + * @brief Topology metadata. + * + * This type stores the topology metadata. */ -/*-------------------------------------------------------------------------*/ typedef struct FTIT_topology { /** Topology metadata. */ int nbProc; /** Total global number of proc. */ int nbNodes; /** Total global number of nodes. */ @@ -215,13 +193,11 @@ typedef struct FTIT_topology { /** Topology metadata. */ int body[FTI_BUFS]; /** List of app. proc. in the node.*/ } FTIT_topology; -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_checkpoint - @brief Checkpoint metadata. - - This type stores all the checkpoint metadata. + * @brief Checkpoint metadata. + * + * This type stores all the checkpoint metadata. */ -/*-------------------------------------------------------------------------*/ typedef struct FTIT_checkpoint { /** Checkpoint metadata. */ char dir[FTI_BUFS]; /** Checkpoint directory. */ char metaDir[FTI_BUFS]; /** Metadata directory. */ @@ -229,13 +205,11 @@ typedef struct FTIT_checkpoint { /** Checkpoint metadata. */ int ckptIntv; /** Checkpoint interval. */ } FTIT_checkpoint; -/*-------------------------------------------------------------------------*/ /** @typedef FTIT_injection - @brief Type to describe failure injections in FTI. 
- - This type allows users to describe a SDC failure injection model. -*/ -/*-------------------------------------------------------------------------*/ + * @brief Type to describe failure injections in FTI. + * + * This type allows users to describe a SDC failure injection model. + */ typedef struct FTIT_injection { /** FTI type declarator. */ int rank; /** Rank of proc. that injects */ int index; /** Array index of the bit-flip. */ @@ -246,13 +220,10 @@ typedef struct FTIT_injection { /** FTI type declarator. */ double timer; /** Timer to measure frequency */ } FTIT_injection; - - /*--------------------------------------------------------------------------- Global variables ---------------------------------------------------------------------------*/ - /** MPI communicator that splits the global one into app and FTI appart. */ MPI_Comm FTI_COMM_WORLD; /** Topology of the system. */ @@ -287,12 +258,10 @@ FTIT_type FTI_DBLE; /** FTI data type for long doble floating point. */ FTIT_type FTI_LDBE; - /*--------------------------------------------------------------------------- FTI public functions ---------------------------------------------------------------------------*/ - int FTI_Init(char *configFile, MPI_Comm globalComm); int FTI_Status(); int FTI_InitType(FTIT_type *type, int size); @@ -303,42 +272,8 @@ int FTI_Recover(); int FTI_Snapshot(); int FTI_Finalize(); - -/*--------------------------------------------------------------------------- - FTI private functions ----------------------------------------------------------------------------*/ - - -void FTI_Print(char *msg, int priority); -int FTI_Try(int result, char* message); -int FTI_CheckErasures(unsigned long *fs, unsigned long *maxFs, int group, int *erased, int level); -int FTI_Clean(int level, int group, int rank); -int FTI_Local(int group); -int FTI_Ptner(int group); -int FTI_RSenc(int group); -int FTI_Flush(int group, int level); -int FTI_RecoverL1(int group); -int FTI_RecoverL2(int group); -int 
FTI_RecoverL3(int group); -int FTI_RecoverL4(int group); -int FTI_GetMeta(unsigned long *fs, unsigned long *mfs, int group, int level); -int FTI_CreateMetadata(int globalTmp); -int FTI_RmDir(char path[FTI_BUFS], int flag); -int FTI_UpdateIterTime(); -int FTI_PostCkpt(int group, int fo, int pr); -int FTI_WriteCkpt(FTIT_dataset* FTI_Data); -int FTI_Listen(); -int FTI_RecoverFiles(); -int FTI_UpdateConf(int restart); -int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]); -int FTI_Topology(); -int FTI_LoadConf(FTIT_injection *FTI_Inje); - - - #ifdef __cplusplus } #endif #endif /* ----- #ifndef _FTI_H ----- */ - diff --git a/src/api.c b/src/api.c index 3d16b35ac..463181f78 100644 --- a/src/api.c +++ b/src/api.c @@ -6,6 +6,7 @@ */ #include "fti.h" +#include "interface.h" /** Array of datasets and all their internal information. */ static FTIT_dataset FTI_Data[FTI_BUFS]; diff --git a/src/checkpoint.c b/src/checkpoint.c index 6edd15976..61100f110 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -6,6 +6,7 @@ */ #include "fti.h" +#include "interface.h" /*-------------------------------------------------------------------------*/ /** diff --git a/src/conf.c b/src/conf.c index dcccd3a58..0337aa0c1 100644 --- a/src/conf.c +++ b/src/conf.c @@ -6,6 +6,7 @@ */ #include "fti.h" +#include "interface.h" /*-------------------------------------------------------------------------*/ /** diff --git a/src/interface.h b/src/interface.h new file mode 100644 index 000000000..1346ab976 --- /dev/null +++ b/src/interface.h @@ -0,0 +1,77 @@ +/** + * @file interface.h + * @author + * @date February, 2016 + * @brief Header file for the FTI library private functions. 
+ */ + +#ifndef _FTI_INTERFACE_H +#define _FTI_INTERFACE_H + +#include "../deps/iniparser/iniparser.h" +#include "../deps/iniparser/dictionary.h" + +#include "../deps/jerasure/galois.h" +#include "../deps/jerasure/jerasure.h" + +/*--------------------------------------------------------------------------- + Defines +---------------------------------------------------------------------------*/ + +/** Malloc macro. */ +#define talloc(type, num) (type *)malloc(sizeof(type) * (num)) + +/*--------------------------------------------------------------------------- + FTI private functions +---------------------------------------------------------------------------*/ + +void FTI_Abort(); +int FTI_FloatBitFlip(float *target, int bit); +int FTI_DoubleBitFlip(double *target, int bit); + +int FTI_UpdateIterTime(); +int FTI_WriteCkpt(FTIT_dataset* FTI_Data); +int FTI_GroupClean(int level, int group, int pr); +int FTI_PostCkpt(int group, int fo, int pr); +int FTI_Listen(); + +int FTI_UpdateConf(int restart); +int FTI_ReadConf(FTIT_injection *FTI_Inje); +int FTI_TestConfig(); +int FTI_TestDirectories(); +int FTI_CreateDirs(); +int FTI_LoadConf(FTIT_injection *FTI_Inje); + +int FTI_GetMeta(unsigned long *fs, unsigned long *mfs, int group, int level); +int FTI_WriteMetadata(unsigned long *fs, unsigned long mfs, char* fnl); +int FTI_CreateMetadata(int globalTmp); + +int FTI_Local(int group); +int FTI_Ptner(int group); +int FTI_RSenc(int group); +int FTI_Flush(int group, int level); + +int FTI_Decode(int fs, int maxFs, int *erased); +int FTI_RecoverL1(int group); +int FTI_RecoverL2(int group); +int FTI_RecoverL3(int group); +int FTI_RecoverL4(int group); + +int FTI_CheckFile(char *fn, unsigned long fs); +int FTI_CheckErasures(unsigned long *fs, unsigned long *maxFs, int group, int *erased, int level); +int FTI_RecoverFiles(); + +int FTI_Clean(int level, int group, int rank); +void FTI_Print(char *msg, int priority); +int FTI_Try(int result, char* message); +int 
FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]); +int FTI_RmDir(char path[FTI_BUFS], int flag); +int FTI_Clean(int level, int group, int rank); + +int FTI_SaveTopo(char *nameList); +int FTI_ReorderNodes(int *nodeList, char *nameList); +int FTI_BuildNodeList(int *nodeList, char *nameList); +int FTI_CreateComms(int *userProcList, int *distProcList, int* nodeList); +int FTI_Topology(); + +#endif diff --git a/src/meta.c b/src/meta.c index d618e4260..1d16af052 100644 --- a/src/meta.c +++ b/src/meta.c @@ -6,6 +6,7 @@ */ #include "fti.h" +#include "interface.h" /*-------------------------------------------------------------------------*/ /** diff --git a/src/postckpt.c b/src/postckpt.c index 9656ef5a0..5c12a3964 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -6,6 +6,7 @@ */ #include "fti.h" +#include "interface.h" /*-------------------------------------------------------------------------*/ /** diff --git a/src/postreco.c b/src/postreco.c index b295c269b..77791b715 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -6,6 +6,7 @@ */ #include "fti.h" +#include "interface.h" /*-------------------------------------------------------------------------*/ /** diff --git a/src/recover.c b/src/recover.c index 3562d4e9b..cba2b3e58 100644 --- a/src/recover.c +++ b/src/recover.c @@ -6,6 +6,7 @@ */ #include "fti.h" +#include "interface.h" /*-------------------------------------------------------------------------*/ /** diff --git a/src/tools.c b/src/tools.c index 8213cb0df..b8cd0f7b9 100644 --- a/src/tools.c +++ b/src/tools.c @@ -6,8 +6,8 @@ */ #include "fti.h" +#include "interface.h" #include -#include int FTI_Clean(int level, int group, int rank); diff --git a/src/topo.c b/src/topo.c index 7682d9a66..924409476 100644 --- a/src/topo.c +++ b/src/topo.c @@ -6,6 +6,7 @@ */ #include "fti.h" +#include "interface.h" /*-------------------------------------------------------------------------*/ /** From 2973338cd5fdc8b441e503b7f64fd2dd02700ea5 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 2 Feb 2016 16:13:29 +0100 Subject: [PATCH 06/93] Cleaning interface --- include/fti.h | 8 -------- src/api.c | 1 - src/checkpoint.c | 1 - src/conf.c | 1 - src/interface.h | 10 ++++++++++ src/meta.c | 1 - src/postckpt.c | 1 - src/postreco.c | 1 - src/recover.c | 1 - src/tools.c | 1 - src/topo.c | 1 - 11 files changed, 10 insertions(+), 17 deletions(-) diff --git a/include/fti.h b/include/fti.h index 15b0f3509..b95a32bea 100644 --- a/include/fti.h +++ b/include/fti.h @@ -8,14 +8,6 @@ #ifndef _FTI_H #define _FTI_H -#include -#include -#include -#include -#include -#include -#include - #include /*--------------------------------------------------------------------------- diff --git a/src/api.c b/src/api.c index 463181f78..1245a869f 100644 --- a/src/api.c +++ b/src/api.c @@ -5,7 +5,6 @@ * @brief API functions for the FTI library. */ -#include "fti.h" #include "interface.h" /** Array of datasets and all their internal information. */ diff --git a/src/checkpoint.c b/src/checkpoint.c index 61100f110..6ee75dbba 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -5,7 +5,6 @@ * @brief Checkpointing functions for the FTI library. */ -#include "fti.h" #include "interface.h" /*-------------------------------------------------------------------------*/ diff --git a/src/conf.c b/src/conf.c index 0337aa0c1..8565108a7 100644 --- a/src/conf.c +++ b/src/conf.c @@ -5,7 +5,6 @@ * @brief Configuration loading functions for the FTI library. 
*/ -#include "fti.h" #include "interface.h" /*-------------------------------------------------------------------------*/ diff --git a/src/interface.h b/src/interface.h index 1346ab976..f19de8396 100644 --- a/src/interface.h +++ b/src/interface.h @@ -8,12 +8,22 @@ #ifndef _FTI_INTERFACE_H #define _FTI_INTERFACE_H +#include "fti.h" + #include "../deps/iniparser/iniparser.h" #include "../deps/iniparser/dictionary.h" #include "../deps/jerasure/galois.h" #include "../deps/jerasure/jerasure.h" +#include +#include +#include +#include +#include +#include +#include + /*--------------------------------------------------------------------------- Defines ---------------------------------------------------------------------------*/ diff --git a/src/meta.c b/src/meta.c index 1d16af052..06b8628f9 100644 --- a/src/meta.c +++ b/src/meta.c @@ -5,7 +5,6 @@ * @brief Metadata functions for the FTI library. */ -#include "fti.h" #include "interface.h" /*-------------------------------------------------------------------------*/ diff --git a/src/postckpt.c b/src/postckpt.c index 5c12a3964..9095aa9e3 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -5,7 +5,6 @@ * @brief Post-checkpointing functions for the FTI library. */ -#include "fti.h" #include "interface.h" /*-------------------------------------------------------------------------*/ diff --git a/src/postreco.c b/src/postreco.c index 77791b715..7c8f1f418 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -5,7 +5,6 @@ * @brief Post recovery functions for the FTI library. */ -#include "fti.h" #include "interface.h" /*-------------------------------------------------------------------------*/ diff --git a/src/recover.c b/src/recover.c index cba2b3e58..dcd1bbd19 100644 --- a/src/recover.c +++ b/src/recover.c @@ -5,7 +5,6 @@ * @brief Recovery functions for the FTI library. 
*/ -#include "fti.h" #include "interface.h" /*-------------------------------------------------------------------------*/ diff --git a/src/tools.c b/src/tools.c index b8cd0f7b9..df9c12ac8 100644 --- a/src/tools.c +++ b/src/tools.c @@ -5,7 +5,6 @@ * @brief Utility functions for the FTI library. */ -#include "fti.h" #include "interface.h" #include diff --git a/src/topo.c b/src/topo.c index 924409476..b323fd050 100644 --- a/src/topo.c +++ b/src/topo.c @@ -5,7 +5,6 @@ * @brief Topology functions for the FTI library. */ -#include "fti.h" #include "interface.h" /*-------------------------------------------------------------------------*/ From e1ce5fd8cc4e0c931360cbaf9f6f50f814e1dd42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 3 Feb 2016 11:16:50 +0100 Subject: [PATCH 07/93] Adding -fPIC flag for dependency libraries --- deps/iniparser/CMakeLists.txt | 3 +++ deps/jerasure/CMakeLists.txt | 3 +++ 2 files changed, 6 insertions(+) diff --git a/deps/iniparser/CMakeLists.txt b/deps/iniparser/CMakeLists.txt index f2bb84a4f..4de9fe09c 100644 --- a/deps/iniparser/CMakeLists.txt +++ b/deps/iniparser/CMakeLists.txt @@ -2,4 +2,7 @@ set(SRC_iniparser dictionary.h dictionary.c iniparser.h iniparser.c) +append_property(SOURCE ${SRC_iniparser} + PROPERTY COMPILE_FLAGS "-fPIC") + add_library(iniparser ${SRC_iniparser}) diff --git a/deps/jerasure/CMakeLists.txt b/deps/jerasure/CMakeLists.txt index 5cfd297c8..0e0c0df41 100644 --- a/deps/jerasure/CMakeLists.txt +++ b/deps/jerasure/CMakeLists.txt @@ -2,4 +2,7 @@ set(SRC_jerasure galois.h galois.c jerasure.h jerasure.c) +append_property(SOURCE ${SRC_jerasure} + PROPERTY COMPILE_FLAGS "-fPIC") + add_library(jerasure ${SRC_jerasure}) From 56b8777aaeaafe36f3ebc2489ab38ff9f1b5949d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 3 Feb 2016 11:28:33 +0100 Subject: [PATCH 08/93] Adding header file to the FTI fortran interface --- src/fortran/ftif.c | 8 ++++---- src/fortran/ftif.h | 15
+++++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 src/fortran/ftif.h diff --git a/src/fortran/ftif.c b/src/fortran/ftif.c index c9b4ec3c1..ac4cc2fe4 100644 --- a/src/fortran/ftif.c +++ b/src/fortran/ftif.c @@ -6,9 +6,9 @@ * @date 2013-08-01 */ -#include - -#include +#include "fti.h" +#include "interface.h" +#include "ftif.h" /** @brief Fortran wrapper for FTI_Init, Initializes FTI. * @@ -59,4 +59,4 @@ int FTI_InitType_wrapper(FTIT_type** type, int size) int FTI_Protect_wrapper(int id, void* ptr, long count, FTIT_type* type) { return FTI_Protect(id, ptr, count, *type); -} \ No newline at end of file +} diff --git a/src/fortran/ftif.h b/src/fortran/ftif.h new file mode 100644 index 000000000..a0e65b2f1 --- /dev/null +++ b/src/fortran/ftif.h @@ -0,0 +1,15 @@ +/** + * @file ftif.h + * @author + * @date February, 2016 + * @brief Header file for the FTI Fortran interface. + */ + +#ifndef _FTIF_H +#define _FTIF_H + +int FTI_Init_fort_wrapper(char* configFile, int* globalComm); +int FTI_InitType_wrapper(FTIT_type** type, int size); +int FTI_Protect_wrapper(int id, void* ptr, long count, FTIT_type* type); + +#endif From c5bc7a7183eac4c5bc0245462ded07a33eec470c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 15:59:23 +0100 Subject: [PATCH 09/93] fixes #6 --- src/topo.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/topo.c b/src/topo.c index b323fd050..64614f3a3 100644 --- a/src/topo.c +++ b/src/topo.c @@ -277,18 +277,20 @@ int FTI_Topology() int res, nn, found, c1 = 0, c2 = 0, p, i, mypos, posInNode; char str[FTI_BUFS], *nameList = talloc(char, FTI_Topo.nbNodes *FTI_BUFS); int* nodeList = talloc(int, FTI_Topo.nbNodes* FTI_Topo.nodeSize); - int* distProcList = talloc(int, FTI_Topo.nbNodes); - int* userProcList = talloc(int, FTI_Topo.nbProc - (FTI_Topo.nbNodes * FTI_Topo.nbHeads)); for (i = 0; i < FTI_Topo.nbProc; i++) { nodeList[i] = -1; } res = 
FTI_Try(FTI_BuildNodeList(nodeList, nameList), "create node list."); if (res == FTI_NSCS) { + free(nameList); + free(nodeList); return FTI_NSCS; } if (FTI_Exec.reco > 0) { res = FTI_Try(FTI_ReorderNodes(nodeList, nameList), "reorder nodes."); if (res == FTI_NSCS) { + free(nameList); + free(nodeList); return FTI_NSCS; } } @@ -296,9 +298,15 @@ int FTI_Topology() if (FTI_Topo.myRank == 0 && FTI_Exec.reco == 0) { res = FTI_Try(FTI_SaveTopo(nameList), "save topology."); if (res == FTI_NSCS) { + free(nameList); + free(nodeList); return FTI_NSCS; } } + + int *distProcList = talloc(int, FTI_Topo.nbNodes); + int *userProcList = talloc(int, FTI_Topo.nbProc - (FTI_Topo.nbNodes * FTI_Topo.nbHeads)); + for (i = 0; i < FTI_Topo.nbProc; i++) { if (FTI_Topo.myRank == nodeList[i]) { mypos = i; @@ -325,6 +333,10 @@ int FTI_Topology() } res = FTI_Try(FTI_CreateComms(userProcList, distProcList, nodeList), "create communicators."); if (res == FTI_NSCS) { + free(userProcList); + free(distProcList); + free(nameList); + free(nodeList); return FTI_NSCS; } free(userProcList); From 9261c7098d9218f33b6cb94c4707737c6b078df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 16:09:25 +0100 Subject: [PATCH 10/93] fixes #7 --- src/postckpt.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/postckpt.c b/src/postckpt.c index 9095aa9e3..bcbdcf694 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -230,8 +230,8 @@ int FTI_RSenc(int group) /*-------------------------------------------------------------------------*/ int FTI_Flush(int group, int level) { - char lfn[FTI_BUFS], gfn[FTI_BUFS], str[FTI_BUFS], *blBuf1 = talloc(char, FTI_Conf.blockSize); - unsigned long maxFs, fs, ps, pos = 0, bSize = FTI_Conf.blockSize; + char lfn[FTI_BUFS], gfn[FTI_BUFS], str[FTI_BUFS]; + unsigned long maxFs, fs, ps, pos = 0; FILE *lfd, *gfd; if (level == -1) return FTI_SCES; // Fake call for inline PFS checkpoint @@ -278,6 +278,10 @@ int FTI_Flush(int 
group, int level) FTI_Print("L4 cannot open ckpt. file in the PFS.", FTI_EROR); return FTI_NSCS; } + + char *blBuf1 = talloc(char, FTI_Conf.blockSize); + unsigned long bSize = FTI_Conf.blockSize; + while (pos < ps) { // Checkpoint files exchange if ((fs - pos) < FTI_Conf.blockSize) bSize = fs - pos; @@ -285,6 +289,9 @@ int FTI_Flush(int group, int level) fwrite(blBuf1, sizeof(char), bSize, gfd); pos = pos + FTI_Conf.blockSize; } + + free(blBuf1); + fclose(lfd); fclose(gfd); return FTI_SCES; From 811eabf51f09478fa766add302b5fac3190681e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 16:15:51 +0100 Subject: [PATCH 11/93] fixes #9 --- src/postreco.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index 7c8f1f418..c10380ac1 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -410,9 +410,9 @@ int FTI_RecoverL4(int group) { unsigned long maxFs, fs, ps, pos = 0; int j, l, gs, erased[FTI_BUFS]; - char gfn[FTI_BUFS], lfn[FTI_BUFS], *blBuf1; + char gfn[FTI_BUFS], lfn[FTI_BUFS]; FILE *gfd, *lfd; - blBuf1 = talloc(char, FTI_Conf.blockSize); // Allocate memory + gs = FTI_Topo.groupSize; if (FTI_Topo.nodeRank == 0 || FTI_Topo.nodeRank == 1) { if (access(FTI_Ckpt[1].dir, F_OK) != 0) { @@ -464,13 +464,18 @@ int FTI_RecoverL4(int group) FTI_Print("R4 cannot open the local ckpt. 
file.", FTI_DBUG); return FTI_NSCS; } + + char *blBuf1 = talloc(char, FTI_Conf.blockSize); while (pos < ps) { // Checkpoint files transfer from PFS fread(blBuf1, sizeof(char), FTI_Conf.blockSize, gfd); fwrite(blBuf1, sizeof(char), FTI_Conf.blockSize, lfd); pos = pos + FTI_Conf.blockSize; } + free(blBuf1); + fclose(gfd); fclose(lfd); // Close files + if (truncate(gfn, fs) == -1) { FTI_Print("R4 cannot re-truncate the checkpoint file in the PFS.", FTI_DBUG); return FTI_NSCS; @@ -479,6 +484,6 @@ int FTI_RecoverL4(int group) FTI_Print("R4 cannot re-truncate the local checkpoint file.", FTI_DBUG); return FTI_NSCS; } - free(blBuf1); + return FTI_SCES; } From 7d503f9c7289e9537ba373ba2f368f2928732c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 16:46:31 +0100 Subject: [PATCH 12/93] applying clang-format webkit style to include and src files --- include/fti.h | 54 +++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/include/fti.h b/include/fti.h index b95a32bea..f7c0f38af 100644 --- a/include/fti.h +++ b/include/fti.h @@ -5,8 +5,8 @@ * @brief Header file for the FTI library. */ -#ifndef _FTI_H -#define _FTI_H +#ifndef _FTI_H +#define _FTI_H #include @@ -15,42 +15,42 @@ ---------------------------------------------------------------------------*/ /** Standard size of buffer and mas node size. */ -#define FTI_BUFS 256 +#define FTI_BUFS 256 /** Word size used during RS encoding. */ -#define FTI_WORD 16 +#define FTI_WORD 16 /** Token returned when FTI performs a checkpoint. */ -#define FTI_DONE 1 +#define FTI_DONE 1 /** Token returned if a FTI function succeeds. */ -#define FTI_SCES 0 +#define FTI_SCES 0 /** Token returned if a FTI function fails. */ -#define FTI_NSCS -1 +#define FTI_NSCS -1 /** Verbosity level to print only errors. */ -#define FTI_EROR 4 +#define FTI_EROR 4 /** Verbosity level to print only warning and errors. 
*/ -#define FTI_WARN 3 +#define FTI_WARN 3 /** Verbosity level to print main information. */ -#define FTI_INFO 2 +#define FTI_INFO 2 /** Verbosity level to print debug messages. */ -#define FTI_DBUG 1 +#define FTI_DBUG 1 /** Token for checkpoint Baseline. */ -#define FTI_BASE 990 +#define FTI_BASE 990 /** Token for checkpoint Level 1. */ -#define FTI_CKTW 991 +#define FTI_CKTW 991 /** Token for checkpoint Level 2. */ -#define FTI_XORW 992 +#define FTI_XORW 992 /** Token for checkpoint Level 3. */ -#define FTI_RSEW 993 +#define FTI_RSEW 993 /** Token for checkpoint Level 4. */ -#define FTI_PFSW 994 +#define FTI_PFSW 994 /** Token for end of the execution. */ -#define FTI_ENDW 995 +#define FTI_ENDW 995 /** Token to reject checkpoint. */ -#define FTI_REJW 996 +#define FTI_REJW 996 #ifdef __cplusplus -extern "C"{ +extern "C" { #endif /*--------------------------------------------------------------------------- @@ -217,15 +217,15 @@ typedef struct FTIT_injection { /** FTI type declarator. */ ---------------------------------------------------------------------------*/ /** MPI communicator that splits the global one into app and FTI appart. */ -MPI_Comm FTI_COMM_WORLD; +MPI_Comm FTI_COMM_WORLD; /** Topology of the system. */ -FTIT_topology FTI_Topo; +FTIT_topology FTI_Topo; /** Dynamic information for this execution. */ -FTIT_execution FTI_Exec; +FTIT_execution FTI_Exec; /** Checkpoint information for all levels of checkpoint. */ -FTIT_checkpoint FTI_Ckpt[5]; +FTIT_checkpoint FTI_Ckpt[5]; /** General configuration information used by FTI. */ -FTIT_configuration FTI_Conf; +FTIT_configuration FTI_Conf; /** FTI data type for chars. 
*/ FTIT_type FTI_CHAR; @@ -256,8 +256,8 @@ FTIT_type FTI_LDBE; int FTI_Init(char *configFile, MPI_Comm globalComm); int FTI_Status(); -int FTI_InitType(FTIT_type *type, int size); -int FTI_Protect(int id, void *ptr, long count, FTIT_type type); +int FTI_InitType(FTIT_type* type, int size); +int FTI_Protect(int id, void* ptr, long count, FTIT_type type); int FTI_BitFlip(int datasetID); int FTI_Checkpoint(int id, int level); int FTI_Recover(); @@ -268,4 +268,4 @@ int FTI_Finalize(); } #endif -#endif /* ----- #ifndef _FTI_H ----- */ +#endif /* ----- #ifndef _FTI_H ----- */ From fbbe241125891f2ff4a8d1aa0cd475f42a54bc0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 16:59:17 +0100 Subject: [PATCH 13/93] fixes #13 --- src/checkpoint.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/checkpoint.c b/src/checkpoint.c index 6ee75dbba..1fca8dd2c 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -90,6 +90,7 @@ int FTI_WriteCkpt(FTIT_dataset* FTI_Data) if (fwrite(FTI_Data[i].ptr, FTI_Data[i].eleSize, FTI_Data[i].count, fd) != FTI_Data[i].count) { sprintf(str, "Dataset #%d could not be written.", FTI_Data[i].id); FTI_Print(str, FTI_EROR); + fclose(fd); return FTI_NSCS; } } From 3d4d423222989f3f5c6f05dc5ac46ecf9a2b637a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 17:02:40 +0100 Subject: [PATCH 14/93] fixes #14 --- src/postckpt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/postckpt.c b/src/postckpt.c index bcbdcf694..e1bb516c6 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -71,15 +71,18 @@ int FTI_Ptner(int group) src = FTI_Topo.left; lfd = fopen(lfn, "rb"); - pfd = fopen(pfn, "wb"); if (lfd == NULL) { FTI_Print("FTI failed to open L2 chckpt. 
file.", FTI_DBUG); return FTI_NSCS; } + + pfd = fopen(pfn, "wb"); if (pfd == NULL) { FTI_Print("FTI failed to open L2 partner file.", FTI_DBUG); + fclose(lfd); return FTI_NSCS; } + blBuf1 = talloc(char, FTI_Conf.blockSize); blBuf2 = talloc(char, FTI_Conf.blockSize); while (pos < ps) { // Checkpoint files partner copy From a5f286765c4722338e7a6777593424587b9ae815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 17:04:32 +0100 Subject: [PATCH 15/93] fixes #15 --- src/postckpt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/postckpt.c b/src/postckpt.c index e1bb516c6..b9bec1d09 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -143,13 +143,15 @@ int FTI_RSenc(int group) return FTI_NSCS; lfd = fopen(lfn, "rb"); - efd = fopen(efn, "wb"); if (lfd == NULL) { FTI_Print("FTI failed to open L3 checkpoint file.", FTI_EROR); return FTI_NSCS; } + + efd = fopen(efn, "wb"); if (efd == NULL) { FTI_Print("FTI failed to open encoded ckpt. file.", FTI_EROR); + fclose(lfd); return FTI_NSCS; } From 950ce1ed63846221a3f773c4590b6be6faa50271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 17:06:12 +0100 Subject: [PATCH 16/93] fixes #16 --- src/postckpt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/postckpt.c b/src/postckpt.c index b9bec1d09..a93564209 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -273,14 +273,17 @@ int FTI_Flush(int group, int level) FTI_Print("L4 cannot access the checkpoint file.", FTI_EROR); return FTI_NSCS; } + lfd = fopen(lfn, "rb"); if (lfd == NULL) { FTI_Print("L4 cannot open the checkpoint file.", FTI_EROR); return FTI_NSCS; } + gfd = fopen(gfn, "wb"); if (gfd == NULL) { FTI_Print("L4 cannot open ckpt. 
file in the PFS.", FTI_EROR); + fclose(lfd); return FTI_NSCS; } From 5f3d2fc287c7993d993e2a2aec0fc42954b406c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 17:15:42 +0100 Subject: [PATCH 17/93] fixes #17 --- src/postreco.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/postreco.c b/src/postreco.c index c10380ac1..a54513357 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -85,10 +85,13 @@ int FTI_Decode(int fs, int maxFs, int* erased) } if (fd == NULL) { FTI_Print("R3 cannot open checkpoint file.", FTI_DBUG); + if (efd) + fclose(efd); return FTI_NSCS; } if (efd == NULL) { FTI_Print("R3 cannot open encoded ckpt. file.", FTI_DBUG); + fclose(fd); return FTI_NSCS; } while (pos < ps) { // Main loop, block by block From dad580e23b9b0608b53ff8ae9b6a44f3fa41cd79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 17:23:44 +0100 Subject: [PATCH 18/93] trying to fix #18 but it needs more work --- src/postreco.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/postreco.c b/src/postreco.c index a54513357..9ca7969b6 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -235,14 +235,18 @@ int FTI_RecoverL2(int group) FTI_Print(str, FTI_DBUG); sprintf(str, "Opening partner ckpt. file (%s) to recover (L2).", jfn); FTI_Print(str, FTI_DBUG); + lfd = fopen(lfn, "wb"); jfd = fopen(jfn, "wb"); if (lfd == NULL) { FTI_Print("R2 cannot open the checkpoint file.", FTI_DBUG); + if (jfd) + fclose(jfd); return FTI_NSCS; } if (jfd == NULL) { FTI_Print("R2 cannot open the partner ckpt. 
file.", FTI_DBUG); + fclose(lfd); return FTI_NSCS; } } From 3798e21ec57cade787046cf8cfe9c8ee11cb31ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 17:25:54 +0100 Subject: [PATCH 19/93] fixes #19 --- src/postreco.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/postreco.c b/src/postreco.c index 9ca7969b6..baf9dd53e 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -461,14 +461,17 @@ int FTI_RecoverL4(int group) FTI_Print("R4 cannot truncate the ckpt. file in the PFS.", FTI_DBUG); return FTI_NSCS; } + gfd = fopen(gfn, "rb"); - lfd = fopen(lfn, "wb"); if (gfd == NULL) { FTI_Print("R4 cannot open the ckpt. file in the PFS.", FTI_DBUG); return FTI_NSCS; } + + lfd = fopen(lfn, "wb"); if (lfd == NULL) { FTI_Print("R4 cannot open the local ckpt. file.", FTI_DBUG); + fclose(gfd); return FTI_NSCS; } From 450c5330357348336147be0dc8ca92c9b59f493d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 17:28:14 +0100 Subject: [PATCH 20/93] fixes #26 --- src/conf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/conf.c b/src/conf.c index 8565108a7..e6dd064d7 100644 --- a/src/conf.c +++ b/src/conf.c @@ -39,15 +39,18 @@ int FTI_UpdateConf(int restart) FILE* fd = fopen(FTI_Conf.cfgFile, "w"); if (fd == NULL) { FTI_Print("FTI failed to open the configuration file.", FTI_EROR); + iniparser_freedict(ini); return FTI_NSCS; } iniparser_dump_ini(ini, fd); // Write new configuration if (fflush(fd) != 0) { FTI_Print("FTI failed to flush the configuration file.", FTI_EROR); + iniparser_freedict(ini); return FTI_NSCS; } if (fclose(fd) != 0) { FTI_Print("FTI failed to close the configuration file.", FTI_EROR); + iniparser_freedict(ini); return FTI_NSCS; } iniparser_freedict(ini); // Free dictionary From c31d39862c2daa506f029d898faf181ff9e59af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Mon, 1 Feb 2016 17:29:54 +0100 Subject: [PATCH 21/93] fixes #27 
--- src/topo.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/topo.c b/src/topo.c index 64614f3a3..b53efee88 100644 --- a/src/topo.c +++ b/src/topo.c @@ -46,15 +46,18 @@ int FTI_SaveTopo(char* nameList) FILE* fd = fopen(mfn, "w"); if (fd == NULL) { FTI_Print("Topology file could NOT be opened", FTI_WARN); + iniparser_freedict(ini); return FTI_NSCS; } iniparser_dump_ini(ini, fd); // Write new topology if (fflush(fd) != 0) { FTI_Print("Topology file could NOT be flushed.", FTI_WARN); + iniparser_freedict(ini); return FTI_NSCS; } if (fclose(fd) != 0) { FTI_Print("Topology file could NOT be closed.", FTI_WARN); + iniparser_freedict(ini); return FTI_NSCS; } iniparser_freedict(ini); From 627dddaf83376b26bdedcaee71d647a3392244cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 24 Feb 2016 18:58:51 +0100 Subject: [PATCH 22/93] Fixes #8 --- src/postreco.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 2 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index baf9dd53e..8327b4370 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -22,17 +22,21 @@ int FTI_Decode(int fs, int maxFs, int* erased) int *matrix, *decMatrix, *dm_ids, *tmpmat, i, j, k, m, ps, bs, pos = 0; char **coding, **data, *dataTmp, fn[FTI_BUFS], efn[FTI_BUFS], str[FTI_BUFS]; FILE *fd, *efd; + bs = FTI_Conf.blockSize; k = FTI_Topo.groupSize; m = k; ps = ((maxFs / FTI_Conf.blockSize)) * FTI_Conf.blockSize; if (ps < maxFs) ps = ps + FTI_Conf.blockSize; // Calculating padding size + if (access(FTI_Ckpt[3].dir, F_OK) != 0) mkdir(FTI_Ckpt[3].dir, 0777); + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, i); + data = talloc(char*, k); coding = talloc(char*, m); dataTmp = talloc(char, FTI_Conf.blockSize* k); @@ -56,7 +60,8 @@ int FTI_Decode(int fs, int maxFs, int* erased) j++; } } - 
for (i = 0; i < k; i++) { // Building the matrix + // Building the matrix + for (i = 0; i < k; i++) { if (dm_ids[i] < k) { for (j = 0; j < k; j++) tmpmat[i * k + j] = 0; @@ -66,14 +71,41 @@ int FTI_Decode(int fs, int maxFs, int* erased) for (j = 0; j < k; j++) { tmpmat[i * k + j] = matrix[(dm_ids[i] - k) * k + j]; } - } // Inversing the matrix + } + // Inversing the matrix if (jerasure_invert_matrix(tmpmat, decMatrix, k, FTI_Conf.l3WordSize) < 0) { FTI_Print("Error inversing matrix", FTI_DBUG); + + for (i = 0; i < m; i++) { + free(coding[i]); + free(data[i]); + } + free(tmpmat); + free(dm_ids); + free(decMatrix); + free(matrix); + free(data); + free(dataTmp); + free(coding); + return FTI_NSCS; } if (erased[FTI_Topo.groupRank] == 0) { // Resize and open files if (truncate(fn, ps) == -1) { FTI_Print("Error with truncate on checkpoint file", FTI_DBUG); + + for (i = 0; i < m; i++) { + free(coding[i]); + free(data[i]); + } + free(tmpmat); + free(dm_ids); + free(decMatrix); + free(matrix); + free(data); + free(dataTmp); + free(coding); + return FTI_NSCS; } fd = fopen(fn, "rb"); @@ -87,11 +119,38 @@ int FTI_Decode(int fs, int maxFs, int* erased) FTI_Print("R3 cannot open checkpoint file.", FTI_DBUG); if (efd) fclose(efd); + + for (i = 0; i < m; i++) { + free(coding[i]); + free(data[i]); + } + free(tmpmat); + free(dm_ids); + free(decMatrix); + free(matrix); + free(data); + free(dataTmp); + free(coding); + return FTI_NSCS; } if (efd == NULL) { FTI_Print("R3 cannot open encoded ckpt. 
file.", FTI_DBUG); + fclose(fd); + + for (i = 0; i < m; i++) { + free(coding[i]); + free(data[i]); + } + free(tmpmat); + free(dm_ids); + free(decMatrix); + free(matrix); + free(data); + free(dataTmp); + free(coding); + return FTI_NSCS; } while (pos < ps) { // Main loop, block by block @@ -126,12 +185,43 @@ int FTI_Decode(int fs, int maxFs, int* erased) fclose(efd); // Closing files if (truncate(fn, fs) == -1) { FTI_Print("R3 cannot re-truncate checkpoint file.", FTI_DBUG); + + for (i = 0; i < m; i++) { + free(coding[i]); + free(data[i]); + } + free(tmpmat); + free(dm_ids); + free(decMatrix); + free(matrix); + free(data); + free(dataTmp); + free(coding); + return FTI_NSCS; } if (truncate(efn, fs) == -1) { FTI_Print("R3 cannot re-truncate encoded ckpt. file.", FTI_DBUG); + + for (i = 0; i < m; i++) { + free(coding[i]); + free(data[i]); + } + free(tmpmat); + free(dm_ids); + free(decMatrix); + free(matrix); + free(data); + free(dataTmp); + free(coding); + return FTI_NSCS; } + + for (i = 0; i < m; i++) { + free(coding[i]); + free(data[i]); + } free(tmpmat); free(dm_ids); free(decMatrix); @@ -139,6 +229,7 @@ int FTI_Decode(int fs, int maxFs, int* erased) free(data); free(dataTmp); free(coding); + return FTI_SCES; } From b2772b3b1589bfcfd7257ec99f552a7b705d3fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 24 Feb 2016 19:12:16 +0100 Subject: [PATCH 23/93] Very ugly commit which fixes #10 --- src/postreco.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 3 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index 8327b4370..4982e39f0 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -281,23 +281,35 @@ int FTI_RecoverL2(int group) char str[FTI_BUFS], lfn[FTI_BUFS], pfn[FTI_BUFS], jfn[FTI_BUFS], qfn[FTI_BUFS]; char *blBuf1, *blBuf2, *blBuf3, *blBuf4; unsigned long ps, fs, maxFs, pos = 0; + FILE *lfd, *pfd, *jfd, *qfd; + MPI_Request reqSend1, reqRecv1, reqSend2, reqRecv2; MPI_Status status; + 
blBuf1 = talloc(char, FTI_Conf.blockSize); blBuf2 = talloc(char, FTI_Conf.blockSize); blBuf3 = talloc(char, FTI_Conf.blockSize); blBuf4 = talloc(char, FTI_Conf.blockSize); + gs = FTI_Topo.groupSize; src = FTI_Topo.left; dest = FTI_Topo.right; + if (access(FTI_Ckpt[2].dir, F_OK) != 0) mkdir(FTI_Ckpt[2].dir, 0777); - if (FTI_CheckErasures(&fs, &maxFs, group, erased, 2) != FTI_SCES) // Checking erasures - { + // Checking erasures + if (FTI_CheckErasures(&fs, &maxFs, group, erased, 2) != FTI_SCES) { FTI_Print("Error checking erasures.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } + buf = -1; for (j = 0; j < gs; j++) if (erased[j] && erased[((j + 1) % gs) + gs]) @@ -305,8 +317,15 @@ int FTI_RecoverL2(int group) sprintf(str, "A checkpoint file and its partner copy (ID in group : %d) have been lost", buf); if (buf > -1) { FTI_Print(str, FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } + buf = 0; for (j = 0; j < gs * 2; j++) if (erased[j]) @@ -333,11 +352,23 @@ int FTI_RecoverL2(int group) FTI_Print("R2 cannot open the checkpoint file.", FTI_DBUG); if (jfd) fclose(jfd); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } if (jfd == NULL) { FTI_Print("R2 cannot open the partner ckpt. file.", FTI_DBUG); fclose(lfd); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } } @@ -348,11 +379,23 @@ int FTI_RecoverL2(int group) FTI_Print(str, FTI_DBUG); if (truncate(pfn, ps) == -1) { FTI_Print("R2 cannot truncate the partner ckpt. file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } pfd = fopen(pfn, "rb"); if (pfd == NULL) { FTI_Print("R2 cannot open partner ckpt. 
file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } } @@ -362,11 +405,23 @@ int FTI_RecoverL2(int group) FTI_Print(str, FTI_DBUG); if (truncate(qfn, ps) == -1) { FTI_Print("R2 cannot truncate the ckpt. file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } qfd = fopen(qfn, "rb"); if (qfd == NULL) { FTI_Print("R2 cannot open ckpt. file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } } @@ -392,10 +447,22 @@ int FTI_RecoverL2(int group) MPI_Wait(&reqRecv2, &status); if (fwrite(blBuf2, sizeof(char), FTI_Conf.blockSize, lfd) != FTI_Conf.blockSize) { FTI_Print("Errors writting the data in the R2 checkpoint file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } if (fwrite(blBuf4, sizeof(char), FTI_Conf.blockSize, jfd) != FTI_Conf.blockSize) { FTI_Print("Errors writting the data in the R2 partner ckpt. file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } } @@ -404,6 +471,12 @@ int FTI_RecoverL2(int group) if (erased[FTI_Topo.groupRank]) { // Close files if (fclose(lfd) != 0) { FTI_Print("R2 cannot close the checkpoint file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } if (truncate(lfn, fs) == -1) { @@ -412,36 +485,76 @@ int FTI_RecoverL2(int group) } if (fclose(jfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } if (truncate(jfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } } if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { if (fclose(pfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. 
file", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } if (truncate(pfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } } if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { if (fclose(qfd) != 0) { FTI_Print("R2 cannot close the ckpt. file", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } if (truncate(qfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the ckpt. file.", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } } } + free(blBuf1); - free(blBuf2); // Free memory + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_SCES; } From 5b7814dbd124b20c124386f044c107a53f451919 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 24 Feb 2016 19:15:45 +0100 Subject: [PATCH 24/93] Fixes #11 --- deps/jerasure/jerasure.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/deps/jerasure/jerasure.c b/deps/jerasure/jerasure.c index 0b81527f3..d5e0f1015 100644 --- a/deps/jerasure/jerasure.c +++ b/deps/jerasure/jerasure.c @@ -260,7 +260,10 @@ int *jerasure_matrix_to_bitmatrix(int k, int m, int w, int *matrix) int rowelts, rowindex, colindex, elt, i, j, l, x; bitmatrix = talloc(int, k*m*w*w); - if (matrix == NULL) { return NULL; } + if (matrix == NULL) { + free(bitmatrix); + return NULL; + } rowelts = k * w; rowindex = 0; @@ -825,7 +828,11 @@ static int **jerasure_generate_decoding_schedule(int k, int m, int w, int *bitma row_ids = talloc(int, k+m); ind_to_row = talloc(int, k+m); - if (set_up_ids_for_scheduled_decoding(k, m, erasures, row_ids, ind_to_row) < 0) return NULL; + if (set_up_ids_for_scheduled_decoding(k, m, erasures, row_ids, ind_to_row) < 0) { + free(row_ids); + free(ind_to_row); + return NULL; + } /* Now, we're going to create one decoding 
matrix which is going to decode everything with one call. The hope is that the scheduler From a71e3bd905d4941db99e94971a69b8a1f4a675f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 24 Feb 2016 19:35:45 +0100 Subject: [PATCH 25/93] Yet another ugly commit, probably fixes #18 --- src/postreco.c | 117 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 109 insertions(+), 8 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index 4982e39f0..65eb65852 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -337,7 +337,9 @@ int FTI_RecoverL2(int group) ps = ps + FTI_Conf.blockSize; // Calculating padding size sprintf(str, "File size: %ld, max. file size : %ld and padding size : %ld.", fs, maxFs, ps); FTI_Print(str, FTI_DBUG); - if (erased[FTI_Topo.groupRank]) { // Open checkpoint file to recover + + // Open checkpoint file to recover + if (erased[FTI_Topo.groupRank]) { sprintf(lfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); sprintf(jfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); @@ -347,11 +349,8 @@ int FTI_RecoverL2(int group) FTI_Print(str, FTI_DBUG); lfd = fopen(lfn, "wb"); - jfd = fopen(jfn, "wb"); if (lfd == NULL) { FTI_Print("R2 cannot open the checkpoint file.", FTI_DBUG); - if (jfd) - fclose(jfd); free(blBuf1); free(blBuf2); @@ -360,8 +359,11 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } + + jfd = fopen(jfn, "wb"); if (jfd == NULL) { FTI_Print("R2 cannot open the partner ckpt. 
file.", FTI_DBUG); + fclose(lfd); free(blBuf1); @@ -372,14 +374,22 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } } - if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { // Truncate and open partner file to transfer + + // Truncate and open partner file to transfer + if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); sprintf(pfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); sprintf(str, "Opening partner ckpt. file (%s) to transfer (L2).", pfn); FTI_Print(str, FTI_DBUG); + if (truncate(pfn, ps) == -1) { FTI_Print("R2 cannot truncate the partner ckpt. file.", FTI_DBUG); + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -387,10 +397,16 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } + pfd = fopen(pfn, "rb"); if (pfd == NULL) { FTI_Print("R2 cannot open partner ckpt. file.", FTI_DBUG); + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -399,13 +415,23 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } } - if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { // Truncate and open partner file to transfer + + // Truncate and open partner file to transfer + if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { sprintf(qfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); sprintf(str, "Opening ckpt. file (%s) to transfer (L2).", qfn); FTI_Print(str, FTI_DBUG); + if (truncate(qfn, ps) == -1) { FTI_Print("R2 cannot truncate the ckpt. file.", FTI_DBUG); + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + if (pfd) + fclose(pfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -413,10 +439,18 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } + qfd = fopen(qfn, "rb"); if (qfd == NULL) { FTI_Print("R2 cannot open ckpt. 
file.", FTI_DBUG); + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + if (pfd) + fclose(pfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -425,7 +459,9 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } } - while (pos < ps) { // Checkpoint files exchange + + // Checkpoint files exchange + while (pos < ps) { if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { fread(blBuf1, sizeof(char), FTI_Conf.blockSize, pfd); MPI_Isend(blBuf1, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend1); @@ -448,6 +484,15 @@ int FTI_RecoverL2(int group) if (fwrite(blBuf2, sizeof(char), FTI_Conf.blockSize, lfd) != FTI_Conf.blockSize) { FTI_Print("Errors writting the data in the R2 checkpoint file.", FTI_DBUG); + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + if (pfd) + fclose(pfd); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -458,6 +503,15 @@ int FTI_RecoverL2(int group) if (fwrite(blBuf4, sizeof(char), FTI_Conf.blockSize, jfd) != FTI_Conf.blockSize) { FTI_Print("Errors writting the data in the R2 partner ckpt. 
file.", FTI_DBUG); + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + if (pfd) + fclose(pfd); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -468,10 +522,21 @@ int FTI_RecoverL2(int group) } pos = pos + FTI_Conf.blockSize; } - if (erased[FTI_Topo.groupRank]) { // Close files + + // Close files + if (erased[FTI_Topo.groupRank]) { if (fclose(lfd) != 0) { FTI_Print("R2 cannot close the checkpoint file.", FTI_DBUG); + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + if (pfd) + fclose(pfd); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -481,11 +546,30 @@ int FTI_RecoverL2(int group) } if (truncate(lfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the checkpoint file.", FTI_DBUG); + + if (jfd) + fclose(jfd); + if (pfd) + fclose(pfd); + if (qfd) + fclose(qfd); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + return FTI_NSCS; } + if (fclose(jfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. file.", FTI_DBUG); + if (pfd) + fclose(pfd); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -496,6 +580,15 @@ int FTI_RecoverL2(int group) if (truncate(jfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. file.", FTI_DBUG); + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + if (pfd) + fclose(pfd); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -504,10 +597,14 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } } + if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { if (fclose(pfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. file", FTI_DBUG); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -518,6 +615,9 @@ int FTI_RecoverL2(int group) if (truncate(pfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. 
file.", FTI_DBUG); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); @@ -526,6 +626,7 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } } + if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { if (fclose(qfd) != 0) { FTI_Print("R2 cannot close the ckpt. file", FTI_DBUG); From 63720a5bd2de6d253bc44dfa14aee6db8a477886 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 24 Feb 2016 20:00:59 +0100 Subject: [PATCH 26/93] Fixes #20 --- examples/CMakeLists.txt | 6 +++--- examples/Makefile | 4 ++-- examples/heatd2.c | 1 - examples/heatdis.c | 1 - 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index c55480a6c..d720c1b59 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -2,7 +2,7 @@ link_directories(${CMAKE_BINARY_DIR}/lib) add_executable(hd.exe heatdis.c) -target_link_libraries(hd.exe fti ${MPI_C_LIBRARIES}) +target_link_libraries(hd.exe fti ${MPI_C_LIBRARIES} m) if(MPI_C_COMPILE_FLAGS) set_target_properties(hd.exe PROPERTIES @@ -16,7 +16,7 @@ endif() add_executable(hd2.exe heatd2.c) -target_link_libraries(hd2.exe fti ${MPI_C_LIBRARIES}) +target_link_libraries(hd2.exe fti ${MPI_C_LIBRARIES} m) if(MPI_C_COMPILE_FLAGS) set_target_properties(hd2.exe PROPERTIES @@ -31,7 +31,7 @@ endif() if(ENABLE_FORTRAN) add_executable(hdf.exe fheatdis.f90) - target_link_libraries(hdf.exe fti_f90 fti ${MPI_Fortran_LIBRARIES}) + target_link_libraries(hdf.exe fti_f90 fti ${MPI_Fortran_LIBRARIES} m) if(MPI_Fortran_COMPILE_FLAGS) set_target_properties(hdf.exe PROPERTIES diff --git a/examples/Makefile b/examples/Makefile index 776d1d0b3..6a58f8c72 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -14,8 +14,8 @@ MPIRUN ?= mpirun FTIFLAG = -I$(FTIPATH)/include -L$(FTIPATH)/lib -lfti -lm FFTIFLAG = -I$(FTIPATH)/include -L$(FTIPATH)/lib -lfti_f90 -lfti -lm # Compiling using static library -#FTIFLAG = -I$(FTIPATH)/include $(FTIPATH)/lib/libfti.a -#FFTIFLAG = 
-I$(FTIPATH)/include $(FTIPATH)/lib/libfti_f90.a $(FTIPATH)/lib/libfti.a +#FTIFLAG = -I$(FTIPATH)/include $(FTIPATH)/lib/libfti.a -lm +#FFTIFLAG = -I$(FTIPATH)/include $(FTIPATH)/lib/libfti_f90.a $(FTIPATH)/lib/libfti.a -lm ## TARGETS all: hd hd2 hdf diff --git a/examples/heatd2.c b/examples/heatd2.c index d93fa6f52..05ab95191 100644 --- a/examples/heatd2.c +++ b/examples/heatd2.c @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/examples/heatdis.c b/examples/heatdis.c index 88d444f20..9783be205 100644 --- a/examples/heatdis.c +++ b/examples/heatdis.c @@ -8,7 +8,6 @@ #include #include -#include #include #include From b8c7d97868e7bf13e1691a6bb96a5ca86819210a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 24 Feb 2016 20:04:06 +0100 Subject: [PATCH 27/93] Fixes #21 --- src/conf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conf.c b/src/conf.c index e6dd064d7..4f680c3e1 100644 --- a/src/conf.c +++ b/src/conf.c @@ -182,7 +182,7 @@ int FTI_TestConfig() return FTI_NSCS; } if (FTI_Topo.nbNodes % FTI_Topo.groupSize != 0) { - FTI_Print("The group size is not multiple of the number of nodes.", FTI_WARN); + FTI_Print("The number of nodes is not multiple of the group size.", FTI_WARN); return FTI_NSCS; } if (FTI_Topo.groupSize <= 2) { From 3e24c7761a5e00922563ef258cb739a4285c8dcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 24 Feb 2016 21:35:40 +0100 Subject: [PATCH 28/93] Removing introduced errors --- src/postreco.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index 65eb65852..a6368fdde 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -490,8 +490,6 @@ int FTI_RecoverL2(int group) fclose(lfd); if (pfd) fclose(pfd); - if (qfd) - fclose(qfd); free(blBuf1); free(blBuf2); @@ -509,8 +507,6 @@ int FTI_RecoverL2(int group) fclose(lfd); if (pfd) fclose(pfd); - if (qfd) - fclose(qfd); free(blBuf1); 
free(blBuf2); @@ -530,12 +526,8 @@ int FTI_RecoverL2(int group) if (jfd) fclose(jfd); - if (lfd) - fclose(lfd); if (pfd) fclose(pfd); - if (qfd) - fclose(qfd); free(blBuf1); free(blBuf2); @@ -551,8 +543,6 @@ int FTI_RecoverL2(int group) fclose(jfd); if (pfd) fclose(pfd); - if (qfd) - fclose(qfd); free(blBuf1); free(blBuf2); @@ -567,8 +557,6 @@ int FTI_RecoverL2(int group) if (pfd) fclose(pfd); - if (qfd) - fclose(qfd); free(blBuf1); free(blBuf2); @@ -580,14 +568,8 @@ int FTI_RecoverL2(int group) if (truncate(jfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. file.", FTI_DBUG); - if (jfd) - fclose(jfd); - if (lfd) - fclose(lfd); if (pfd) fclose(pfd); - if (qfd) - fclose(qfd); free(blBuf1); free(blBuf2); @@ -602,9 +584,6 @@ int FTI_RecoverL2(int group) if (fclose(pfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. file", FTI_DBUG); - if (qfd) - fclose(qfd); - free(blBuf1); free(blBuf2); free(blBuf3); @@ -615,9 +594,6 @@ int FTI_RecoverL2(int group) if (truncate(pfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. 
file.", FTI_DBUG); - if (qfd) - fclose(qfd); - free(blBuf1); free(blBuf2); free(blBuf3); From 90125916bf2642a042b2346442a2fc8aedbf5a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 10:59:43 +0100 Subject: [PATCH 29/93] Fixing several uninitialized pointer read --- src/postreco.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/postreco.c b/src/postreco.c index a6368fdde..e11725fbc 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -282,7 +282,7 @@ int FTI_RecoverL2(int group) char *blBuf1, *blBuf2, *blBuf3, *blBuf4; unsigned long ps, fs, maxFs, pos = 0; - FILE *lfd, *pfd, *jfd, *qfd; + FILE *lfd = NULL, *pfd = NULL, *jfd = NULL, *qfd = NULL; MPI_Request reqSend1, reqRecv1, reqSend2, reqRecv2; MPI_Status status; From 309fde672b21bcbe6e72e8245fc2b6b59d86a9df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 11:02:55 +0100 Subject: [PATCH 30/93] Fixing uninitialized scalar variable bug --- src/topo.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/topo.c b/src/topo.c index b53efee88..fecd734d1 100644 --- a/src/topo.c +++ b/src/topo.c @@ -277,7 +277,7 @@ int FTI_CreateComms(int* userProcList, int* distProcList, int* nodeList) /*-------------------------------------------------------------------------*/ int FTI_Topology() { - int res, nn, found, c1 = 0, c2 = 0, p, i, mypos, posInNode; + int res, nn, found, c1 = 0, c2 = 0, p, i, mypos = -1, posInNode; char str[FTI_BUFS], *nameList = talloc(char, FTI_Topo.nbNodes *FTI_BUFS); int* nodeList = talloc(int, FTI_Topo.nbNodes* FTI_Topo.nodeSize); for (i = 0; i < FTI_Topo.nbProc; i++) { @@ -319,6 +319,9 @@ int FTI_Topology() c2++; } } + if (mypos == -1) + return FTI_NSCS; + FTI_Topo.nodeRank = mypos % FTI_Topo.nodeSize; if (FTI_Topo.nodeRank == 0 && FTI_Topo.nbHeads == 1) { FTI_Topo.amIaHead = 1; From a03be5104bdd0860f16c7d943259491714f05f22 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 11:06:10 +0100 Subject: [PATCH 31/93] Fixing resource leak --- src/postreco.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/postreco.c b/src/postreco.c index e11725fbc..d6e76365a 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -594,6 +594,9 @@ int FTI_RecoverL2(int group) if (truncate(pfn, fs) == -1) { FTI_Print("R2 cannot re-truncate the partner ckpt. file.", FTI_DBUG); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); From f1a16efb6adce4ab304846fe7be41758b0c4aa20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 11:10:10 +0100 Subject: [PATCH 32/93] Fixing resource leak --- src/topo.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/topo.c b/src/topo.c index fecd734d1..8bb4b3fea 100644 --- a/src/topo.c +++ b/src/topo.c @@ -24,43 +24,66 @@ int FTI_SaveTopo(char* nameList) char mfn[FTI_BUFS], str[FTI_BUFS]; dictionary* ini; int i; + sprintf(str, "Trying to load configuration file (%s) to create topology.", FTI_Conf.cfgFile); FTI_Print(str, FTI_DBUG); + ini = iniparser_load(FTI_Conf.cfgFile); if (ini == NULL) { FTI_Print("Iniparser cannot parse the configuration file.", FTI_WARN); + return FTI_NSCS; } - iniparser_set(ini, "topology", NULL); // Set topology section - for (i = 0; i < FTI_Topo.nbNodes; i++) { // Write list of nodes + + // Set topology section + iniparser_set(ini, "topology", NULL); + + // Write list of nodes + for (i = 0; i < FTI_Topo.nbNodes; i++) { strncpy(mfn, nameList + (i * FTI_BUFS), FTI_BUFS); sprintf(str, "topology:%d", i); iniparser_set(ini, str, mfn); - } // Unset sections of the configuration file + } + + // Unset sections of the configuration file iniparser_unset(ini, "basic"); iniparser_unset(ini, "restart"); iniparser_unset(ini, "advanced"); + sprintf(mfn, "%s/Topology.fti", FTI_Conf.metadDir); sprintf(str, "Creating topology file (%s)...", mfn); 
FTI_Print(str, FTI_DBUG); + FILE* fd = fopen(mfn, "w"); if (fd == NULL) { FTI_Print("Topology file could NOT be opened", FTI_WARN); + iniparser_freedict(ini); + return FTI_NSCS; } - iniparser_dump_ini(ini, fd); // Write new topology + + // Write new topology + iniparser_dump_ini(ini, fd); + if (fflush(fd) != 0) { FTI_Print("Topology file could NOT be flushed.", FTI_WARN); + iniparser_freedict(ini); + fclose(fd); + return FTI_NSCS; } if (fclose(fd) != 0) { FTI_Print("Topology file could NOT be closed.", FTI_WARN); + iniparser_freedict(ini); + return FTI_NSCS; } + iniparser_freedict(ini); + return FTI_SCES; } From 46068cbc79cda24a3076a621de732ab9a09ea094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 11:16:00 +0100 Subject: [PATCH 33/93] Fixing several resource leak --- src/topo.c | 56 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/src/topo.c b/src/topo.c index 8bb4b3fea..7eb9e0afe 100644 --- a/src/topo.c +++ b/src/topo.c @@ -104,6 +104,7 @@ int FTI_ReorderNodes(int* nodeList, char* nameList) { char mfn[FTI_BUFS], str[FTI_BUFS], *tmp; int i, j, *nl, *old, *new; + nl = talloc(int, FTI_Topo.nbProc); old = talloc(int, FTI_Topo.nbNodes); new = talloc(int, FTI_Topo.nbNodes); @@ -111,55 +112,84 @@ int FTI_ReorderNodes(int* nodeList, char* nameList) old[i] = -1; new[i] = -1; } + sprintf(mfn, "%s/Topology.fti", FTI_Conf.metadDir); sprintf(str, "Loading FTI topology file (%s) to reorder nodes...", mfn); FTI_Print(str, FTI_DBUG); - if (access(mfn, F_OK) != 0) { // Checking that the topology file exist + + // Checking that the topology file exist + if (access(mfn, F_OK) != 0) { FTI_Print("The topology file is NOT accessible.", FTI_WARN); + + free(nl); + free(old); + free(new); + return FTI_NSCS; } + dictionary* ini; ini = iniparser_load(mfn); if (ini == NULL) { FTI_Print("Iniparser could NOT parse the topology file.", FTI_WARN); + + free(nl); + free(old); + 
free(new); + return FTI_NSCS; } - for (i = 0; i < FTI_Topo.nbNodes; i++) { // Get the old order of nodes + + // Get the old order of nodes + for (i = 0; i < FTI_Topo.nbNodes; i++) { sprintf(str, "Topology:%d", i); tmp = iniparser_getstring(ini, str, NULL); snprintf(str, FTI_BUFS, "%s", tmp); - for (j = 0; j < FTI_Topo.nbNodes; j++) { // Search for same node in current nameList - if (strncmp(str, nameList + (j * FTI_BUFS), FTI_BUFS) == 0) // If found... - { + + // Search for same node in current nameList + for (j = 0; j < FTI_Topo.nbNodes; j++) { + // If found... + if (strncmp(str, nameList + (j * FTI_BUFS), FTI_BUFS) == 0) { old[j] = i; new[i] = j; break; } // ...set matching IDs and break out of the searching loop } } + iniparser_freedict(ini); + j = 0; - for (i = 0; i < FTI_Topo.nbNodes; i++) { // Introducing missing nodes - if (new[i] == -1) { // For each new node.. - while (old[j] != -1) { // ..search for an old node not present in the new list... + // Introducing missing nodes + for (i = 0; i < FTI_Topo.nbNodes; i++) { + // For each new node.. + if (new[i] == -1) { + // ..search for an old node not present in the new list.. + while (old[j] != -1) { j++; - } // .. and set matching IDs + } + // .. 
and set matching IDs old[j] = i; new[i] = j; j++; } } - for (i = 0; i < FTI_Topo.nbProc; i++) { // Copying nodeList in nl + // Copying nodeList in nl + for (i = 0; i < FTI_Topo.nbProc; i++) { nl[i] = nodeList[i]; } - for (i = 0; i < FTI_Topo.nbNodes; i++) { // Creating the new nodeList with the old order + // Creating the new nodeList with the old order + for (i = 0; i < FTI_Topo.nbNodes; i++) { for (j = 0; j < FTI_Topo.nodeSize; j++) { nodeList[(i * FTI_Topo.nodeSize) + j] = nl[(new[i] * FTI_Topo.nodeSize) + j]; } - } // Free memory + } + + // Free memory + free(nl); free(old); free(new); - free(nl); + return FTI_SCES; } From a08534804e9a459b86cdcd55b1034a7a8024fdba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 13:09:11 +0100 Subject: [PATCH 34/93] Fixing resource leak --- src/meta.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/meta.c b/src/meta.c index 06b8628f9..390217da7 100644 --- a/src/meta.c +++ b/src/meta.c @@ -78,15 +78,21 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) char str[FTI_BUFS], buf[FTI_BUFS]; dictionary* ini; int i; + snprintf(buf, FTI_BUFS, "%s/Topology.fti", FTI_Conf.metadDir); sprintf(str, "Temporary load of topology file (%s)...", buf); FTI_Print(str, FTI_DBUG); - ini = iniparser_load(buf); // To bypass iniparser bug while empty dict. + + // To bypass iniparser bug while empty dict. 
+ ini = iniparser_load(buf); if (ini == NULL) { FTI_Print("Temporary topology file could NOT be parsed", FTI_WARN); + return FTI_NSCS; } - for (i = 0; i < FTI_Topo.groupSize; i++) { // Add metadata to dictionary + + // Add metadata to dictionary + for (i = 0; i < FTI_Topo.groupSize; i++) { strncpy(buf, fnl + (i * FTI_BUFS), FTI_BUFS); sprintf(str, "%d", i); iniparser_set(ini, str, NULL); @@ -99,32 +105,47 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) sprintf(buf, "%ld", mfs); iniparser_set(ini, str, buf); } - iniparser_unset(ini, "topology"); // Remove topology section + + // Remove topology section + iniparser_unset(ini, "topology"); if (access(FTI_Conf.mTmpDir, F_OK) != 0) { mkdir(FTI_Conf.mTmpDir, 0777); } + sprintf(buf, "%s/sector%d-group%d.fti", FTI_Conf.mTmpDir, FTI_Topo.sectorID, FTI_Topo.groupID); remove(buf); sprintf(str, "Creating metadata file (%s)...", buf); FTI_Print(str, FTI_DBUG); + FILE* fd = fopen(buf, "w"); if (fd == NULL) { FTI_Print("Metadata file could NOT be opened.", FTI_WARN); + iniparser_freedict(ini); + return FTI_NSCS; } - iniparser_dump_ini(ini, fd); // Write metadata + + // Write metadata + iniparser_dump_ini(ini, fd); + if (fflush(fd) != 0) { FTI_Print("Metadata file could NOT be flushed.", FTI_WARN); + iniparser_freedict(ini); + fclose(fd); + return FTI_NSCS; } if (fclose(fd) != 0) { FTI_Print("Metadata file could NOT be closed.", FTI_WARN); + iniparser_freedict(ini); + return FTI_NSCS; } iniparser_freedict(ini); + return FTI_SCES; } From c15d6d33c6711e009fbb219dd9f85a897e995060 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 13:13:11 +0100 Subject: [PATCH 35/93] Fixing resource leak --- src/conf.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/conf.c b/src/conf.c index 4f680c3e1..eb448f220 100644 --- a/src/conf.c +++ b/src/conf.c @@ -26,34 +26,53 @@ int FTI_UpdateConf(int restart) { char str[FTI_BUFS]; 
dictionary* ini; - ini = iniparser_load(FTI_Conf.cfgFile); // Load dictionary + + // Load dictionary + ini = iniparser_load(FTI_Conf.cfgFile); sprintf(str, "Updating configuration file (%s)...", FTI_Conf.cfgFile); FTI_Print(str, FTI_DBUG); if (ini == NULL) { FTI_Print("Iniparser failed to parse the conf. file.", FTI_WARN); + return FTI_NSCS; } + sprintf(str, "%d", restart); - iniparser_set(ini, "Restart:failure", str); // Set failure to 'restart' - iniparser_set(ini, "Restart:exec_id", FTI_Exec.id); // Set the exec. ID + // Set failure to 'restart' + iniparser_set(ini, "Restart:failure", str); + // Set the exec. ID + iniparser_set(ini, "Restart:exec_id", FTI_Exec.id); + FILE* fd = fopen(FTI_Conf.cfgFile, "w"); if (fd == NULL) { FTI_Print("FTI failed to open the configuration file.", FTI_EROR); + iniparser_freedict(ini); + return FTI_NSCS; } - iniparser_dump_ini(ini, fd); // Write new configuration + + // Write new configuration + iniparser_dump_ini(ini, fd); if (fflush(fd) != 0) { FTI_Print("FTI failed to flush the configuration file.", FTI_EROR); + iniparser_freedict(ini); + fclose(fd); + return FTI_NSCS; } if (fclose(fd) != 0) { FTI_Print("FTI failed to close the configuration file.", FTI_EROR); + iniparser_freedict(ini); + return FTI_NSCS; } - iniparser_freedict(ini); // Free dictionary + + // Free dictionary + iniparser_freedict(ini); + return FTI_SCES; } From 2a9bc9db428ab68e6341529877451612d936d593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 13:19:02 +0100 Subject: [PATCH 36/93] Fixing resource leak --- src/checkpoint.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/checkpoint.c b/src/checkpoint.c index 1fca8dd2c..6bcef3b21 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -72,6 +72,7 @@ int FTI_WriteCkpt(FTIT_dataset* FTI_Data) FILE* fd; double tt = MPI_Wtime(); char fn[FTI_BUFS], str[FTI_BUFS]; + snprintf(FTI_Exec.ckptFile, FTI_BUFS, "Ckpt%d-Rank%d.fti", FTI_Exec.ckptID, 
FTI_Topo.myRank); if (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) { sprintf(fn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); @@ -81,31 +82,40 @@ int FTI_WriteCkpt(FTIT_dataset* FTI_Data) sprintf(fn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); mkdir(FTI_Conf.lTmpDir, 0777); } + fd = fopen(fn, "wb"); if (fd == NULL) { FTI_Print("FTI checkpoint file could not be opened.", FTI_EROR); + return FTI_NSCS; } for (i = 0; i < FTI_Exec.nbVar; i++) { if (fwrite(FTI_Data[i].ptr, FTI_Data[i].eleSize, FTI_Data[i].count, fd) != FTI_Data[i].count) { sprintf(str, "Dataset #%d could not be written.", FTI_Data[i].id); FTI_Print(str, FTI_EROR); + fclose(fd); + return FTI_NSCS; } } if (fflush(fd) != 0) { FTI_Print("FTI checkpoint file could not be flushed.", FTI_EROR); + + fclose(fd); + return FTI_NSCS; } if (fclose(fd) != 0) { FTI_Print("FTI checkpoint file could not be flushed.", FTI_EROR); + return FTI_NSCS; } sprintf(str, "Time writing checkpoint file : %f seconds.", MPI_Wtime() - tt); FTI_Print(str, FTI_DBUG); int globalTmp = (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) ? 
1 : 0; res = FTI_Try(FTI_CreateMetadata(globalTmp), "create metadata."); + return res; } From c4b09c666fe284b5a97ce8074062d5ac1531ba98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 14:08:54 +0100 Subject: [PATCH 37/93] Fixing resource leak --- src/postreco.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/postreco.c b/src/postreco.c index d6e76365a..7ec118208 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -490,6 +490,8 @@ int FTI_RecoverL2(int group) fclose(lfd); if (pfd) fclose(pfd); + if (qfd) + fclose(qfd); free(blBuf1); free(blBuf2); @@ -507,6 +509,8 @@ int FTI_RecoverL2(int group) fclose(lfd); if (pfd) fclose(pfd); + if (qfd) + fclose(qfd); free(blBuf1); free(blBuf2); @@ -528,6 +532,8 @@ int FTI_RecoverL2(int group) fclose(jfd); if (pfd) fclose(pfd); + if (qfd) + fclose(qfd); free(blBuf1); free(blBuf2); @@ -543,6 +549,8 @@ int FTI_RecoverL2(int group) fclose(jfd); if (pfd) fclose(pfd); + if (qfd) + fclose(qfd); free(blBuf1); free(blBuf2); @@ -557,6 +565,8 @@ int FTI_RecoverL2(int group) if (pfd) fclose(pfd); + if (qfd) + fclose(qfd); free(blBuf1); free(blBuf2); @@ -570,6 +580,8 @@ int FTI_RecoverL2(int group) if (pfd) fclose(pfd); + if (qfd) + fclose(qfd); free(blBuf1); free(blBuf2); @@ -584,6 +596,9 @@ int FTI_RecoverL2(int group) if (fclose(pfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. 
file", FTI_DBUG); + if (qfd) + fclose(qfd); + free(blBuf1); free(blBuf2); free(blBuf3); From 50099fcc583f15b5fc95b087f81c17abfe3c8fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 25 Feb 2016 14:12:56 +0100 Subject: [PATCH 38/93] Fixing bugs introduced during earlier fixing --- src/topo.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/topo.c b/src/topo.c index 7eb9e0afe..3a2b1ebe0 100644 --- a/src/topo.c +++ b/src/topo.c @@ -332,30 +332,38 @@ int FTI_Topology() { int res, nn, found, c1 = 0, c2 = 0, p, i, mypos = -1, posInNode; char str[FTI_BUFS], *nameList = talloc(char, FTI_Topo.nbNodes *FTI_BUFS); + int* nodeList = talloc(int, FTI_Topo.nbNodes* FTI_Topo.nodeSize); for (i = 0; i < FTI_Topo.nbProc; i++) { nodeList[i] = -1; } + res = FTI_Try(FTI_BuildNodeList(nodeList, nameList), "create node list."); if (res == FTI_NSCS) { free(nameList); free(nodeList); + return FTI_NSCS; } + if (FTI_Exec.reco > 0) { res = FTI_Try(FTI_ReorderNodes(nodeList, nameList), "reorder nodes."); if (res == FTI_NSCS) { free(nameList); free(nodeList); + return FTI_NSCS; } } - MPI_Barrier(FTI_Exec.globalComm); // Need to synchronize before editing topology file + + // Need to synchronize before editing topology file + MPI_Barrier(FTI_Exec.globalComm); if (FTI_Topo.myRank == 0 && FTI_Exec.reco == 0) { res = FTI_Try(FTI_SaveTopo(nameList), "save topology."); if (res == FTI_NSCS) { free(nameList); free(nodeList); + return FTI_NSCS; } } @@ -372,8 +380,14 @@ int FTI_Topology() c2++; } } - if (mypos == -1) + if (mypos == -1) { + free(userProcList); + free(distProcList); + free(nameList); + free(nodeList); + return FTI_NSCS; + } FTI_Topo.nodeRank = mypos % FTI_Topo.nodeSize; if (FTI_Topo.nodeRank == 0 && FTI_Topo.nbHeads == 1) { @@ -390,17 +404,21 @@ int FTI_Topology() for (i = 0; i < FTI_Topo.nbNodes; i++) { distProcList[i] = nodeList[(FTI_Topo.nodeSize * i) + posInNode]; } + res = FTI_Try(FTI_CreateComms(userProcList, 
distProcList, nodeList), "create communicators."); if (res == FTI_NSCS) { free(userProcList); free(distProcList); free(nameList); free(nodeList); + return FTI_NSCS; } + free(userProcList); free(distProcList); free(nameList); free(nodeList); + return FTI_SCES; } From 89b921a68e4c0868ad99108f98e1f41749836e6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 11:55:13 +0100 Subject: [PATCH 39/93] Fixing unchecked return value from rename function --- src/api.c | 93 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 40 deletions(-) diff --git a/src/api.c b/src/api.c index 1245a869f..3c7ad7fee 100644 --- a/src/api.c +++ b/src/api.c @@ -453,53 +453,66 @@ int FTI_Snapshot() /*-------------------------------------------------------------------------*/ int FTI_Finalize() { - if (!FTI_Topo.amIaHead) { - int buff = FTI_ENDW; - MPI_Status status; - if (FTI_Exec.wasLastOffline == 1) { // If there is remaining work to do for last checkpoint - MPI_Recv(&buff, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm, &status); - if (buff != FTI_NSCS) { - FTI_Exec.ckptLvel = buff; - FTI_Exec.wasLastOffline = 1; - FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; - } - } - buff = FTI_ENDW; - if (FTI_Topo.nbHeads == 1) { // Send notice to the head to stop listening - MPI_Send(&buff, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm); + if (FTI_Topo.amIaHead) { + MPI_Barrier(FTI_Exec.globalComm); + MPI_Finalize(); + exit(0); + } + + // Not FTI_Topo.amIaHead + int buff = FTI_ENDW; + MPI_Status status; + + // If there is remaining work to do for last checkpoint + if (FTI_Exec.wasLastOffline == 1) { + MPI_Recv(&buff, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, + FTI_Exec.globalComm, &status); + if (buff != FTI_NSCS) { + FTI_Exec.ckptLvel = buff; + FTI_Exec.wasLastOffline = 1; + FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; } - if (FTI_Conf.saveLastCkpt) { // If we need to keep the last checkpoint 
- if (FTI_Exec.lastCkptLvel != 4) { - FTI_Try(FTI_Flush(FTI_Topo.groupID, FTI_Exec.lastCkptLvel), "save the last ckpt. in the PFS."); - MPI_Barrier(FTI_COMM_WORLD); - if (FTI_Topo.splitRank == 0) { - if (access(FTI_Ckpt[4].dir, 0) == 0) - FTI_RmDir(FTI_Ckpt[4].dir, 1); - if (access(FTI_Ckpt[4].metaDir, 0) == 0) - FTI_RmDir(FTI_Ckpt[4].metaDir, 1); - rename(FTI_Ckpt[FTI_Exec.lastCkptLvel].metaDir, FTI_Ckpt[4].metaDir); - rename(FTI_Conf.gTmpDir, FTI_Ckpt[4].dir); - } - } + } + buff = FTI_ENDW; + + // Send notice to the head to stop listening + if (FTI_Topo.nbHeads == 1) { + MPI_Send(&buff, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm); + } + + // If we need to keep the last checkpoint + if (FTI_Conf.saveLastCkpt) { + if (FTI_Exec.lastCkptLvel != 4) { + FTI_Try(FTI_Flush(FTI_Topo.groupID, FTI_Exec.lastCkptLvel), "save the last ckpt. in the PFS."); + MPI_Barrier(FTI_COMM_WORLD); if (FTI_Topo.splitRank == 0) { - FTI_Try(FTI_UpdateConf(2), "update configuration file to 2."); + if (access(FTI_Ckpt[4].dir, 0) == 0) + FTI_RmDir(FTI_Ckpt[4].dir, 1); + if (access(FTI_Ckpt[4].metaDir, 0) == 0) + FTI_RmDir(FTI_Ckpt[4].metaDir, 1); + + if (rename(FTI_Ckpt[FTI_Exec.lastCkptLvel].metaDir, FTI_Ckpt[4].metaDir) == -1) + FTI_Print("cannot save last ckpt. metaDir", FTI_EROR); + if (rename(FTI_Conf.gTmpDir, FTI_Ckpt[4].dir) == -1) + FTI_Print("cannot save last ckpt. 
dir", FTI_EROR); } - buff = 6; // For cleaning only local storage } - else { - if (FTI_Topo.splitRank == 0) { - FTI_Try(FTI_UpdateConf(0), "update configuration file to 0."); - } - buff = 5; // For cleaning everything + if (FTI_Topo.splitRank == 0) { + FTI_Try(FTI_UpdateConf(2), "update configuration file to 2."); } - MPI_Barrier(FTI_Exec.globalComm); - FTI_Try(FTI_Clean(buff, FTI_Topo.groupID, FTI_Topo.myRank), "do final clean."); - FTI_Print("FTI has been finalized.", FTI_INFO); + buff = 6; // For cleaning only local storage } else { - MPI_Barrier(FTI_Exec.globalComm); - MPI_Finalize(); - exit(0); + if (FTI_Topo.splitRank == 0) { + FTI_Try(FTI_UpdateConf(0), "update configuration file to 0."); + } + buff = 5; // For cleaning everything } + MPI_Barrier(FTI_Exec.globalComm); + FTI_Try(FTI_Clean(buff, FTI_Topo.groupID, FTI_Topo.myRank), "do final clean."); + FTI_Print("FTI has been finalized.", FTI_INFO); + return FTI_SCES; } + + From 3014805ef4ad681969a464aec2a95fdf48aa000b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 12:01:26 +0100 Subject: [PATCH 40/93] Fixing unchecked return value from rename function --- src/checkpoint.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/src/checkpoint.c b/src/checkpoint.c index 6bcef3b21..a1d8c78b3 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -171,14 +171,18 @@ int FTI_PostCkpt(int group, int fo, int pr) int i, tres, res, level, nodeFlag, globalFlag = FTI_Topo.splitRank; double t0, t1, t2, t3; char str[FTI_BUFS]; + t0 = MPI_Wtime(); + res = (FTI_Exec.ckptLvel == (FTI_REJW - FTI_BASE)) ? 
FTI_NSCS : FTI_SCES; MPI_Allreduce(&res, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); if (tres != FTI_SCES) { FTI_GroupClean(0, group, pr); return FTI_NSCS; } + t1 = MPI_Wtime(); + for (i = 0; i < pr; i++) { switch (FTI_Exec.ckptLvel) { case 4: @@ -200,22 +204,30 @@ int FTI_PostCkpt(int group, int fo, int pr) FTI_GroupClean(0, group, pr); return FTI_NSCS; } + t2 = MPI_Wtime(); + FTI_GroupClean(FTI_Exec.ckptLvel, group, pr); MPI_Barrier(FTI_COMM_WORLD); nodeFlag = (((!FTI_Topo.amIaHead) && (FTI_Topo.nodeRank == 0)) || (FTI_Topo.amIaHead)) ? 1 : 0; if (nodeFlag) { level = (FTI_Exec.ckptLvel != 4) ? FTI_Exec.ckptLvel : 1; - rename(FTI_Conf.lTmpDir, FTI_Ckpt[level].dir); - FTI_Print("Local directory renamed", FTI_DBUG); + if (rename(FTI_Conf.lTmpDir, FTI_Ckpt[level].dir) == -1) + FTI_Print("Cannot rename local directory", FTI_EROR); + else + FTI_Print("Local directory renamed", FTI_DBUG); } if (!globalFlag) { if (FTI_Exec.ckptLvel == 4) { - rename(FTI_Conf.gTmpDir, FTI_Ckpt[FTI_Exec.ckptLvel].dir); + if (rename(FTI_Conf.gTmpDir, FTI_Ckpt[FTI_Exec.ckptLvel].dir) == -1) + FTI_Print("Cannot rename global directory", FTI_EROR); } - rename(FTI_Conf.mTmpDir, FTI_Ckpt[FTI_Exec.ckptLvel].metaDir); + if (rename(FTI_Conf.mTmpDir, FTI_Ckpt[FTI_Exec.ckptLvel].metaDir) == -1) + FTI_Print("Cannot rename meta directory", FTI_EROR); } + t3 = MPI_Wtime(); + sprintf(str, "Post-checkpoint took %.2f sec.", t3 - t0); sprintf(str, "%s (Ag:%.2fs, Pt:%.2fs, Cl:%.2fs)", str, t1 - t0, t2 - t1, t3 - t2); FTI_Print(str, FTI_INFO); From 8bf91dfce12b1b596feeb8bcd565801cfa348943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 13:18:15 +0100 Subject: [PATCH 41/93] Fixing unchecked return value from mkdir function --- src/checkpoint.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/checkpoint.c b/src/checkpoint.c index a1d8c78b3..8536fe7bd 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -70,17 +70,20 @@ int 
FTI_WriteCkpt(FTIT_dataset* FTI_Data) { int i, res; FILE* fd; - double tt = MPI_Wtime(); char fn[FTI_BUFS], str[FTI_BUFS]; + double tt = MPI_Wtime(); + snprintf(FTI_Exec.ckptFile, FTI_BUFS, "Ckpt%d-Rank%d.fti", FTI_Exec.ckptID, FTI_Topo.myRank); if (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) { sprintf(fn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); - mkdir(FTI_Conf.gTmpDir, 0777); + if (mkdir(FTI_Conf.gTmpDir, 0777) == -1) + FTI_Print("Cannot create global directory", FTI_EROR); } else { sprintf(fn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); - mkdir(FTI_Conf.lTmpDir, 0777); + if (mkdir(FTI_Conf.lTmpDir, 0777) == -1) + FTI_Print("Cannot create local directory", FTI_EROR); } fd = fopen(fn, "wb"); From 030a37f65a1bd794a1485cd3100149535dbea36e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 13:27:44 +0100 Subject: [PATCH 42/93] Fixing unchecked return value from mkdir function --- src/conf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/conf.c b/src/conf.c index eb448f220..dcf6217fc 100644 --- a/src/conf.c +++ b/src/conf.c @@ -311,7 +311,8 @@ int FTI_CreateDirs() // Create metadata timestamp directory snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf.metadDir, FTI_Exec.id); if (access(fn, F_OK) != 0) { - mkdir(fn, 0777); + if (mkdir(fn, 0777) == -1) + FTI_Print("Cannot create metadata timestamp directory", FTI_EROR); } snprintf(FTI_Conf.metadDir, FTI_BUFS, "%s", fn); snprintf(FTI_Conf.mTmpDir, FTI_BUFS, "%s/tmp", fn); @@ -324,7 +325,8 @@ int FTI_CreateDirs() snprintf(fn, FTI_BUFS, "%s", FTI_Conf.glbalDir); snprintf(FTI_Conf.glbalDir, FTI_BUFS, "%s/%s", fn, FTI_Exec.id); if (access(FTI_Conf.glbalDir, F_OK) != 0) { - mkdir(FTI_Conf.glbalDir, 0777); + if (mkdir(FTI_Conf.glbalDir, 0777) == -1) + FTI_Print("Cannot create global checkpoint timestamp directory", FTI_EROR); } snprintf(FTI_Conf.gTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf.glbalDir); snprintf(FTI_Ckpt[4].dir, FTI_BUFS, "%s/l4", 
FTI_Conf.glbalDir); @@ -333,7 +335,8 @@ int FTI_CreateDirs() if (FTI_Conf.test) { // If local test generate name by topology snprintf(fn, FTI_BUFS, "%s/node%d", FTI_Conf.localDir, FTI_Topo.myRank / FTI_Topo.nodeSize); if (access(fn, F_OK) != 0) { - mkdir(fn, 0777); + if (mkdir(fn, 0777) == -1) + FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); } } else { @@ -341,12 +344,14 @@ int FTI_CreateDirs() } snprintf(FTI_Conf.localDir, FTI_BUFS, "%s/%s", fn, FTI_Exec.id); if (access(FTI_Conf.localDir, F_OK) != 0) { - mkdir(FTI_Conf.localDir, 0777); + if (mkdir(FTI_Conf.localDir, 0777) == -1) + FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); } snprintf(FTI_Conf.lTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf.localDir); snprintf(FTI_Ckpt[1].dir, FTI_BUFS, "%s/l1", FTI_Conf.localDir); snprintf(FTI_Ckpt[2].dir, FTI_BUFS, "%s/l2", FTI_Conf.localDir); snprintf(FTI_Ckpt[3].dir, FTI_BUFS, "%s/l3", FTI_Conf.localDir); + return FTI_SCES; } From 10e561320d8c8f245f4c2af0267bdb03abf4f882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 13:51:54 +0100 Subject: [PATCH 43/93] Fixing unchecked return value from remove function --- src/tools.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/tools.c b/src/tools.c index df9c12ac8..23c7d4304 100644 --- a/src/tools.c +++ b/src/tools.c @@ -171,40 +171,56 @@ int FTI_Clean(int level, int group, int rank) { char buf[FTI_BUFS]; int nodeFlag, globalFlag = !FTI_Topo.splitRank; + nodeFlag = (((!FTI_Topo.amIaHead) && (FTI_Topo.nodeRank == 0)) || (FTI_Topo.amIaHead)) ? 
1 : 0; + if (level == 0) { FTI_RmDir(FTI_Conf.mTmpDir, globalFlag); FTI_RmDir(FTI_Conf.gTmpDir, globalFlag); FTI_RmDir(FTI_Conf.lTmpDir, nodeFlag); } - if (level >= 1) { // Clean last checkpoint level 1 + + // Clean last checkpoint level 1 + if (level >= 1) { FTI_RmDir(FTI_Ckpt[1].metaDir, globalFlag); FTI_RmDir(FTI_Ckpt[1].dir, nodeFlag); } - if (level >= 2) { // Clean last checkpoint level 2 + + // Clean last checkpoint level 2 + if (level >= 2) { FTI_RmDir(FTI_Ckpt[2].metaDir, globalFlag); FTI_RmDir(FTI_Ckpt[2].dir, nodeFlag); } - if (level >= 3) { // Clean last checkpoint level 3 + + // Clean last checkpoint level 3 + if (level >= 3) { FTI_RmDir(FTI_Ckpt[3].metaDir, globalFlag); FTI_RmDir(FTI_Ckpt[3].dir, nodeFlag); } - if (level == 4 || level == 5) { // Clean last checkpoint level 4 + + // Clean last checkpoint level 4 + if (level == 4 || level == 5) { FTI_RmDir(FTI_Ckpt[4].metaDir, globalFlag); FTI_RmDir(FTI_Ckpt[4].dir, globalFlag); rmdir(FTI_Conf.gTmpDir); } - if (level == 5) { // If it is the very last cleaning and we DO NOT keep the last checkpoint + + // If it is the very last cleaning and we DO NOT keep the last checkpoint + if (level == 5) { rmdir(FTI_Conf.lTmpDir); rmdir(FTI_Conf.localDir); rmdir(FTI_Conf.glbalDir); snprintf(buf, FTI_BUFS, "%s/Topology.fti", FTI_Conf.metadDir); - remove(buf); + if (remove(buf) == -1) + FTI_Print("Cannot remove Topology.fti", FTI_EROR); rmdir(FTI_Conf.metadDir); } - if (level == 6) { // If it is the very last cleaning and we DO keep the last checkpoint + + // If it is the very last cleaning and we DO keep the last checkpoint + if (level == 6) { rmdir(FTI_Conf.lTmpDir); rmdir(FTI_Conf.localDir); } + return FTI_SCES; } From e8fbc5e84b3d7da3f6facba9b429e3c7ab0b9213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 13:53:43 +0100 Subject: [PATCH 44/93] Fixing unchecked return value from mkdir function --- src/meta.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/src/meta.c b/src/meta.c index 390217da7..7bbb99c00 100644 --- a/src/meta.c +++ b/src/meta.c @@ -109,7 +109,8 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) // Remove topology section iniparser_unset(ini, "topology"); if (access(FTI_Conf.mTmpDir, F_OK) != 0) { - mkdir(FTI_Conf.mTmpDir, 0777); + if (mkdir(FTI_Conf.mTmpDir, 0777) == -1) + FTI_Print("Cannot create directory", FTI_EROR); } sprintf(buf, "%s/sector%d-group%d.fti", FTI_Conf.mTmpDir, FTI_Topo.sectorID, FTI_Topo.groupID); From 4d23237cca565738fd0a568b58ad7a2af9c165d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 13:55:50 +0100 Subject: [PATCH 45/93] Fixing unchecked return value from remove function --- src/meta.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/meta.c b/src/meta.c index 7bbb99c00..389567246 100644 --- a/src/meta.c +++ b/src/meta.c @@ -114,7 +114,9 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) } sprintf(buf, "%s/sector%d-group%d.fti", FTI_Conf.mTmpDir, FTI_Topo.sectorID, FTI_Topo.groupID); - remove(buf); + if (remove(buf) == -1) + FTI_Print("Cannot remove sector-group.fti", FTI_EROR); + sprintf(str, "Creating metadata file (%s)...", buf); FTI_Print(str, FTI_DBUG); From e13f6b83a7c26f82ac7b879b1f386e10030402b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 13:57:44 +0100 Subject: [PATCH 46/93] Fixing unchecked return value from mkdir function --- src/postckpt.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/postckpt.c b/src/postckpt.c index a93564209..dbe3e84a0 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -247,8 +247,10 @@ int FTI_Flush(int group, int level) return FTI_NSCS; if (access(FTI_Conf.gTmpDir, F_OK) != 0) { - mkdir(FTI_Conf.gTmpDir, 0777); + if (mkdir(FTI_Conf.gTmpDir, 0777) == -1) + FTI_Print("Cannot create directory", FTI_EROR); } + ps = (maxFs / FTI_Conf.blockSize) * 
FTI_Conf.blockSize; if (ps < maxFs) ps = ps + FTI_Conf.blockSize; @@ -271,19 +273,23 @@ int FTI_Flush(int group, int level) FTI_Print(str, FTI_DBUG); if (access(lfn, R_OK) != 0) { FTI_Print("L4 cannot access the checkpoint file.", FTI_EROR); + return FTI_NSCS; } lfd = fopen(lfn, "rb"); if (lfd == NULL) { FTI_Print("L4 cannot open the checkpoint file.", FTI_EROR); + return FTI_NSCS; } gfd = fopen(gfn, "wb"); if (gfd == NULL) { FTI_Print("L4 cannot open ckpt. file in the PFS.", FTI_EROR); + fclose(lfd); + return FTI_NSCS; } @@ -302,5 +308,6 @@ int FTI_Flush(int group, int level) fclose(lfd); fclose(gfd); + return FTI_SCES; } From ae1bab980e7846b5c417dcd0437fae808643f280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 14:00:42 +0100 Subject: [PATCH 47/93] Fixing unchecked return value from mkdir function --- src/postreco.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/postreco.c b/src/postreco.c index 7ec118208..4022bcc5b 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -297,7 +297,9 @@ int FTI_RecoverL2(int group) dest = FTI_Topo.right; if (access(FTI_Ckpt[2].dir, F_OK) != 0) - mkdir(FTI_Ckpt[2].dir, 0777); + if (mkdir(FTI_Ckpt[2].dir, 0777) == -1) + FTI_Print("Cannot create directory", FTI_EROR); + // Checking erasures if (FTI_CheckErasures(&fs, &maxFs, group, erased, 2) != FTI_SCES) { FTI_Print("Error checking erasures.", FTI_DBUG); From a65985ab4b305f5e5008f268dcd6ef7470c0ae5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 14:02:25 +0100 Subject: [PATCH 48/93] Fixing unchecked return value from mkdir function --- src/postreco.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index 4022bcc5b..492a69206 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -31,8 +31,9 @@ int FTI_Decode(int fs, int maxFs, int* erased) ps = ps + FTI_Conf.blockSize; // Calculating padding size if (access(FTI_Ckpt[3].dir, 
F_OK) != 0) - mkdir(FTI_Ckpt[3].dir, 0777); - + if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) + FTI_Print("Cannot create directory", FTI_EROR); + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, i); From b47eec47a51e95dd361d18d60004e05e60f1e3d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Fri, 26 Feb 2016 14:07:00 +0100 Subject: [PATCH 49/93] Fixing unchecked return value from mkdir function --- src/postckpt.c | 2 +- src/postreco.c | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/postckpt.c b/src/postckpt.c index dbe3e84a0..020573047 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -250,7 +250,7 @@ int FTI_Flush(int group, int level) if (mkdir(FTI_Conf.gTmpDir, 0777) == -1) FTI_Print("Cannot create directory", FTI_EROR); } - + ps = (maxFs / FTI_Conf.blockSize) * FTI_Conf.blockSize; if (ps < maxFs) ps = ps + FTI_Conf.blockSize; diff --git a/src/postreco.c b/src/postreco.c index 492a69206..787d44fcf 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -33,7 +33,7 @@ int FTI_Decode(int fs, int maxFs, int* erased) if (access(FTI_Ckpt[3].dir, F_OK) != 0) if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) FTI_Print("Cannot create directory", FTI_EROR); - + sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, i); @@ -674,24 +674,31 @@ int FTI_RecoverL3(int group) unsigned long fs, maxFs; char str[FTI_BUFS]; gs = FTI_Topo.groupSize; + if (access(FTI_Ckpt[3].dir, F_OK) != 0) - mkdir(FTI_Ckpt[3].dir, 0777); - if (FTI_CheckErasures(&fs, &maxFs, group, erased, 3) != FTI_SCES) // Checking erasures - { + if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) + FTI_Print("Cannot create directory", FTI_EROR); + + // Checking erasures + if 
(FTI_CheckErasures(&fs, &maxFs, group, erased, 3) != FTI_SCES) { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } + + // Counting erasures l = 0; for (j = 0; j < gs; j++) { if (erased[j]) l++; if (erased[j + gs]) l++; - } // Counting erasures + } if (l > gs) { FTI_Print("Too many erasures at L3.", FTI_DBUG); return FTI_NSCS; } + + // Reed-Solomon decoding if (l > 0) { sprintf(str, "There are %d encoded/checkpoint files missing in this group.", l); FTI_Print(str, FTI_DBUG); @@ -699,7 +706,8 @@ int FTI_RecoverL3(int group) FTI_Print("RS-decoding could not regenerate the missing data.", FTI_DBUG); return FTI_NSCS; } - } // Reed-Solomon decoding + } + return FTI_SCES; } From 1b0e1b7f5429a8038bf505adf061948be3c584a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 15:32:34 +0100 Subject: [PATCH 50/93] Fixing possible 'not null terminated string' bug --- src/topo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/topo.c b/src/topo.c index 3a2b1ebe0..500d8f887 100644 --- a/src/topo.c +++ b/src/topo.c @@ -40,7 +40,7 @@ int FTI_SaveTopo(char* nameList) // Write list of nodes for (i = 0; i < FTI_Topo.nbNodes; i++) { - strncpy(mfn, nameList + (i * FTI_BUFS), FTI_BUFS); + strncpy(mfn, nameList + (i * FTI_BUFS), FTI_BUFS - 1); sprintf(str, "topology:%d", i); iniparser_set(ini, str, mfn); } From 97f808a9e1f2629e2c7758859db79b5e9a2692b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 15:35:37 +0100 Subject: [PATCH 51/93] Fixing possible 'not null terminated string' bug --- src/topo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/topo.c b/src/topo.c index 500d8f887..314b34bc4 100644 --- a/src/topo.c +++ b/src/topo.c @@ -224,7 +224,7 @@ int FTI_BuildNodeList(int* nodeList, char* nameList) for (i = 0; i < FTI_Topo.nbProc; i++) { // Creating the node list: For each process found = 0; pos = 0; - strncpy(hname, lhn + (i * FTI_BUFS), FTI_BUFS); 
// Get node name of process i + strncpy(hname, lhn + (i * FTI_BUFS), FTI_BUFS - 1); // Get node name of process i while ((pos < nbNodes) && (found == 0)) { // Search the node name in the current list of node names if (strncmp(&(nameList[pos * FTI_BUFS]), hname, FTI_BUFS) == 0) { // If we find it break out found = 1; From dff426137c3a61cf5ac0341b04a9fdd84f9cf663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 15:36:13 +0100 Subject: [PATCH 52/93] Fixing possible 'not null terminated string' bug --- src/meta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta.c b/src/meta.c index 389567246..e8dce17bb 100644 --- a/src/meta.c +++ b/src/meta.c @@ -190,7 +190,7 @@ int FTI_CreateMetadata(int globalTmp) sprintf(fnl + (FTI_Topo.groupRank * FTI_BUFS), "%s", FTI_Exec.ckptFile); tmpo = fs[FTI_Topo.groupRank]; // Gather all the file sizes MPI_Allgather(&tmpo, 1, MPI_UNSIGNED_LONG, fs, 1, MPI_UNSIGNED_LONG, FTI_Exec.groupComm); - strncpy(str, fnl + (FTI_Topo.groupRank * FTI_BUFS), FTI_BUFS); // Gather all the file names + strncpy(str, fnl + (FTI_Topo.groupRank * FTI_BUFS), FTI_BUFS - 1); // Gather all the file names MPI_Allgather(str, FTI_BUFS, MPI_CHAR, fnl, FTI_BUFS, MPI_CHAR, FTI_Exec.groupComm); mfs = 0; for (i = 0; i < FTI_Topo.groupSize; i++) { From fcc2ee727701eb56b0db48df15d6a73f0e07fd44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 15:36:49 +0100 Subject: [PATCH 53/93] Fixing possible 'not null terminated string' bug --- src/meta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meta.c b/src/meta.c index e8dce17bb..0bad5ebe7 100644 --- a/src/meta.c +++ b/src/meta.c @@ -93,7 +93,7 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) // Add metadata to dictionary for (i = 0; i < FTI_Topo.groupSize; i++) { - strncpy(buf, fnl + (i * FTI_BUFS), FTI_BUFS); + strncpy(buf, fnl + (i * FTI_BUFS), FTI_BUFS - 1); sprintf(str, "%d", 
i); iniparser_set(ini, str, NULL); sprintf(str, "%d:Ckpt_file_name", i); From 08b0514e7c20763e1c0cb2c3820de809579cf01d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 15:52:25 +0100 Subject: [PATCH 54/93] Fixing possible 'division by zero' --- src/conf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conf.c b/src/conf.c index dcf6217fc..c19dc5e33 100644 --- a/src/conf.c +++ b/src/conf.c @@ -165,7 +165,7 @@ int FTI_ReadConf(FTIT_injection* FTI_Inje) FTI_Topo.groupSize = (int)iniparser_getint(ini, "Basic:group_size", -1); FTI_Topo.nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); FTI_Topo.nbApprocs = FTI_Topo.nodeSize - FTI_Topo.nbHeads; - FTI_Topo.nbNodes = FTI_Topo.nbProc / FTI_Topo.nodeSize; + FTI_Topo.nbNodes = (FTI_Topo.nodeSize) ? FTI_Topo.nbProc / FTI_Topo.nodeSize : 0; // Reading/setting injection parameters FTI_Inje->rank = (int)iniparser_getint(ini, "Injection:rank", 0); From 407a077b0df92400a203b70f70fce381bd9be1c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 16:19:33 +0100 Subject: [PATCH 55/93] Fixing 'ignoring number of bytes read' --- src/postreco.c | 47 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index 787d44fcf..39e450e26 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -743,25 +743,30 @@ int FTI_RecoverL4(int group) } } MPI_Barrier(FTI_COMM_WORLD); - if (FTI_CheckErasures(&fs, &maxFs, group, erased, 4) != FTI_SCES) // Checking erasures - { + // Checking erasures + if (FTI_CheckErasures(&fs, &maxFs, group, erased, 4) != FTI_SCES) { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } + l = 0; + // Counting erasures for (j = 0; j < gs; j++) { if (erased[j]) l++; - } // Counting erasures + } if (l > 0) { FTI_Print("Checkpoint file missing at L4.", FTI_DBUG); return FTI_NSCS; } + ps = (fs / FTI_Conf.blockSize) * 
FTI_Conf.blockSize; pos = 0; // For the logic + // Calculating padding size if (ps < fs) - ps = ps + FTI_Conf.blockSize; // Calculating padding size - sprintf(gfn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec.ckptFile); // Open and resize files + ps = ps + FTI_Conf.blockSize; + // Open and resize files + sprintf(gfn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec.ckptFile); sprintf(lfn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); if (access(gfn, R_OK) != 0) { FTI_Print("R4 cannot read the checkpoint file in the PFS.", FTI_DBUG); @@ -786,15 +791,39 @@ int FTI_RecoverL4(int group) } char *blBuf1 = talloc(char, FTI_Conf.blockSize); - while (pos < ps) { // Checkpoint files transfer from PFS - fread(blBuf1, sizeof(char), FTI_Conf.blockSize, gfd); - fwrite(blBuf1, sizeof(char), FTI_Conf.blockSize, lfd); + // Checkpoint files transfer from PFS + while (pos < ps) { + size_t bytes = fread(blBuf1, sizeof(char), FTI_Conf.blockSize, gfd); + if (ferror(gfd)) { + FTI_Print("R4 cannot read from the ckpt. file in the PFS.", FTI_DBUG); + + free(blBuf1); + + fclose(gfd); + fclose(lfd); + + return FTI_NSCS; + } + + fwrite(blBuf1, sizeof(char), bytes, lfd); + if (ferror(lfd)) { + FTI_Print("R4 cannot write to the local ckpt. 
file.", FTI_DBUG); + + free(blBuf1); + + fclose(gfd); + fclose(lfd); + + return FTI_NSCS; + } + pos = pos + FTI_Conf.blockSize; } + free(blBuf1); fclose(gfd); - fclose(lfd); // Close files + fclose(lfd); if (truncate(gfn, fs) == -1) { FTI_Print("R4 cannot re-truncate the checkpoint file in the PFS.", FTI_DBUG); From e4e9b4c1f93b93a4a04af2cb42a4700b2766d2e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 16:28:55 +0100 Subject: [PATCH 56/93] Fixing 'ignoring number of bytes read' --- src/postreco.c | 49 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index 39e450e26..af1cd601f 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -154,36 +154,71 @@ int FTI_Decode(int fs, int maxFs, int* erased) return FTI_NSCS; } - while (pos < ps) { // Main loop, block by block - if (erased[FTI_Topo.groupRank] == 0) { // Reading the data - fread(data[FTI_Topo.groupRank] + 0, sizeof(char), bs, fd); - fread(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); + + // Main loop, block by block + while (pos < ps) { + // Reading the data + if (erased[FTI_Topo.groupRank] == 0) { + (void)fread(data[FTI_Topo.groupRank] + 0, sizeof(char), bs, fd); + (void)fread(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); + + if (ferror(fd) || ferror(efd)) { + FTI_Print("R3 cannot from the ckpt. file or the encoded ckpt. 
file.", FTI_DBUG); + + fclose(fd); + fclose(efd); + + for (i = 0; i < m; i++) { + free(coding[i]); + free(data[i]); + } + free(tmpmat); + free(dm_ids); + free(decMatrix); + free(matrix); + free(data); + free(dataTmp); + free(coding); + + return FTI_NSCS; + } } else { bzero(data[FTI_Topo.groupRank], bs); bzero(coding[FTI_Topo.groupRank], bs); } // Erasure found + MPI_Allgather(data[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); for (i = 0; i < k; i++) memcpy(data[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); + MPI_Allgather(coding[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); for (i = 0; i < k; i++) memcpy(coding[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); - if (erased[FTI_Topo.groupRank]) // Decoding the lost data work + + // Decoding the lost data work + if (erased[FTI_Topo.groupRank]) jerasure_matrix_dotprod(k, FTI_Conf.l3WordSize, decMatrix + (FTI_Topo.groupRank * k), dm_ids, FTI_Topo.groupRank, data, coding, bs); + MPI_Allgather(data[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); for (i = 0; i < k; i++) memcpy(data[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); - if (erased[FTI_Topo.groupRank + k]) // Finally, re-encode any erased encoded checkpoint file + + // Finally, re-encode any erased encoded checkpoint file + if (erased[FTI_Topo.groupRank + k]) jerasure_matrix_dotprod(k, FTI_Conf.l3WordSize, matrix + (FTI_Topo.groupRank * k), NULL, FTI_Topo.groupRank + k, data, coding, bs); if (erased[FTI_Topo.groupRank]) fwrite(data[FTI_Topo.groupRank] + 0, sizeof(char), bs, fd); if (erased[FTI_Topo.groupRank + k]) fwrite(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); + pos = pos + bs; } + + // Closing files fclose(fd); - fclose(efd); // Closing files + fclose(efd); + if (truncate(fn, fs) == -1) { FTI_Print("R3 cannot re-truncate checkpoint file.", FTI_DBUG); From d16b2139731a924a097ae499983208ae4b685183 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 16:42:08 +0100 Subject: [PATCH 57/93] Fixing 'ignoring number of bytes read' --- src/postreco.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index af1cd601f..ff2fff9b6 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -501,11 +501,53 @@ int FTI_RecoverL2(int group) // Checkpoint files exchange while (pos < ps) { if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { - fread(blBuf1, sizeof(char), FTI_Conf.blockSize, pfd); + (void)fread(blBuf1, sizeof(char), FTI_Conf.blockSize, pfd); + + if (ferror(pfd)) { + FTI_Print("Error reading the data from the partner ckpt. file.", FTI_DBUG); + + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + if (pfd) + fclose(pfd); + if (qfd) + fclose(qfd); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + + return FTI_NSCS; + } + MPI_Isend(blBuf1, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend1); } if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { - fread(blBuf3, sizeof(char), FTI_Conf.blockSize, qfd); + (void)fread(blBuf3, sizeof(char), FTI_Conf.blockSize, qfd); + + if (ferror(qfd)) { + FTI_Print("Error reading the data from the ckpt. 
file.", FTI_DBUG); + + if (jfd) + fclose(jfd); + if (lfd) + fclose(lfd); + if (pfd) + fclose(pfd); + if (qfd) + fclose(qfd); + + free(blBuf1); + free(blBuf2); + free(blBuf3); + free(blBuf4); + + return FTI_NSCS; + } + MPI_Isend(blBuf3, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend2); } if (erased[FTI_Topo.groupRank]) { @@ -519,7 +561,9 @@ int FTI_RecoverL2(int group) if (erased[FTI_Topo.groupRank]) { MPI_Wait(&reqRecv1, &status); MPI_Wait(&reqRecv2, &status); - if (fwrite(blBuf2, sizeof(char), FTI_Conf.blockSize, lfd) != FTI_Conf.blockSize) { + + fwrite(blBuf2, sizeof(char), FTI_Conf.blockSize, lfd); + if (ferror(lfd)) { FTI_Print("Errors writting the data in the R2 checkpoint file.", FTI_DBUG); if (jfd) @@ -538,7 +582,9 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } - if (fwrite(blBuf4, sizeof(char), FTI_Conf.blockSize, jfd) != FTI_Conf.blockSize) { + + fwrite(blBuf4, sizeof(char), FTI_Conf.blockSize, jfd); + if (ferror(jfd)) { FTI_Print("Errors writting the data in the R2 partner ckpt. file.", FTI_DBUG); if (jfd) From 43b9bd504f4b27e3ddc4dab67d99caf7cfc5220e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 16:51:35 +0100 Subject: [PATCH 58/93] Fixing 'ignoring number of bytes read' --- src/postckpt.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/postckpt.c b/src/postckpt.c index 020573047..b0099955d 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -33,7 +33,7 @@ int FTI_Local(int group) @param group The group ID. @return integer FTI_SCES if successful. - This function copies the checkpoint files into the pertner node. It + This function copies the checkpoint files into the partner node. It follows a ring, where the ring size is the group size given in the FTI configuration file. 
@@ -85,21 +85,48 @@ int FTI_Ptner(int group) blBuf1 = talloc(char, FTI_Conf.blockSize); blBuf2 = talloc(char, FTI_Conf.blockSize); - while (pos < ps) { // Checkpoint files partner copy + // Checkpoint files partner copy + while (pos < ps) { if ((fs - pos) < FTI_Conf.blockSize) bSize = fs - pos; - fread(blBuf1, sizeof(char), bSize, lfd); + + (void)fread(blBuf1, sizeof(char), bSize, lfd); + if (ferror(lfd)) { + FTI_Print("Error reading data from the L2 ckpt. file", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + fclose(lfd); + fclose(pfd); + + return FTI_NSCS; + } + MPI_Isend(blBuf1, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); MPI_Irecv(blBuf2, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv); MPI_Wait(&reqSend, &status); MPI_Wait(&reqRecv, &status); + fwrite(blBuf2, sizeof(char), bSize, pfd); + if (ferror(pfd)) { + FTI_Print("Error writing data to the L2 partner file", FTI_DBUG); + + free(blBuf1); + free(blBuf2); + fclose(lfd); + fclose(pfd); + + return FTI_NSCS; + } + pos = pos + FTI_Conf.blockSize; } + free(blBuf1); free(blBuf2); fclose(lfd); fclose(pfd); + return FTI_SCES; } From 358c334f0637bf23dc0ffe2e18e568a69110a5a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 17:01:39 +0100 Subject: [PATCH 59/93] Fixing 'ignoring number of bytes read' --- src/postckpt.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/src/postckpt.c b/src/postckpt.c index b0099955d..4c1abd294 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -295,7 +295,9 @@ int FTI_Flush(int group, int level) sprintf(lfn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); break; } - sprintf(gfn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); // Open and resize files + + // Open and resize files + sprintf(gfn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); sprintf(str, "L4 trying to access local ckpt. 
file (%s).", lfn); FTI_Print(str, FTI_DBUG); if (access(lfn, R_OK) != 0) { @@ -323,11 +325,35 @@ int FTI_Flush(int group, int level) char *blBuf1 = talloc(char, FTI_Conf.blockSize); unsigned long bSize = FTI_Conf.blockSize; - while (pos < ps) { // Checkpoint files exchange + // Checkpoint files exchange + while (pos < ps) { if ((fs - pos) < FTI_Conf.blockSize) bSize = fs - pos; - fread(blBuf1, sizeof(char), bSize, lfd); - fwrite(blBuf1, sizeof(char), bSize, gfd); + + size_t bytes = fread(blBuf1, sizeof(char), bSize, lfd); + if (ferror(lfd)) { + FTI_Print("L4 cannot read from the ckpt. file.", FTI_EROR); + + free(blBuf1); + + fclose(lfd); + fclose(gfd); + + return FTI_NSCS; + } + + fwrite(blBuf1, sizeof(char), bytes, gfd); + if (ferror(gfd)) { + FTI_Print("L4 cannot write to the ckpt. file in the PFS.", FTI_EROR); + + free(blBuf1); + + fclose(lfd); + fclose(gfd); + + return FTI_NSCS; + } + pos = pos + FTI_Conf.blockSize; } From 096bdc30bff099bbdff48ba041a52a38e2a6d1db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 1 Mar 2016 17:04:29 +0100 Subject: [PATCH 60/93] Fixing 'ignoring number of bytes read' --- src/api.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/api.c b/src/api.c index 3c7ad7fee..a3aa4b4b3 100644 --- a/src/api.c +++ b/src/api.c @@ -382,7 +382,12 @@ int FTI_Recover() return FTI_NSCS; } for (i = 0; i < FTI_Exec.nbVar; i++) { - fread(FTI_Data[i].ptr, 1, FTI_Data[i].size, fd); + (void)fread(FTI_Data[i].ptr, 1, FTI_Data[i].size, fd); + if (ferror(fd)) { + FTI_Print("Could not read FTI checkpoint file.", FTI_EROR); + fclose(fd); + return FTI_NSCS; + } } if (fclose(fd) != 0) { FTI_Print("Could not close FTI checkpoint file.", FTI_EROR); From ff29394cd560c8c1d63a862be9eed97bac4614cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 2 Mar 2016 10:30:49 +0100 Subject: [PATCH 61/93] Fixing possible 'not null terminated string' bug --- src/topo.c | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/src/topo.c b/src/topo.c index 314b34bc4..6cd8c85f3 100644 --- a/src/topo.c +++ b/src/topo.c @@ -218,7 +218,7 @@ int FTI_BuildNodeList(int* nodeList, char* nameList) else { snprintf(lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS, "node%d", FTI_Topo.myRank / FTI_Topo.nodeSize); // Local } - strncpy(hname, lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS); // Distributing host names + strncpy(hname, lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS - 1); // Distributing host names MPI_Allgather(hname, FTI_BUFS, MPI_CHAR, lhn, FTI_BUFS, MPI_CHAR, FTI_Exec.globalComm); for (i = 0; i < FTI_Topo.nbProc; i++) { // Creating the node list: For each process @@ -246,7 +246,7 @@ int FTI_BuildNodeList(int* nodeList, char* nameList) } } else { // ... else, we add the new node to the end of the current list of nodes - strncpy(&(nameList[pos * FTI_BUFS]), hname, FTI_BUFS); + strncpy(&(nameList[pos * FTI_BUFS]), hname, FTI_BUFS - 1); nodeList[pos * FTI_Topo.nodeSize] = i; nbNodes++; } From 9c08298ad2b4c12931ff3c387a3049a5dbd7b067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 2 Mar 2016 10:44:07 +0100 Subject: [PATCH 62/93] Fixing 'dereference before null check' --- src/postreco.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index ff2fff9b6..608f08700 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -506,12 +506,12 @@ int FTI_RecoverL2(int group) if (ferror(pfd)) { FTI_Print("Error reading the data from the partner ckpt. file.", FTI_DBUG); + fclose(pfd); + if (jfd) fclose(jfd); if (lfd) fclose(lfd); - if (pfd) - fclose(pfd); if (qfd) fclose(qfd); @@ -531,14 +531,14 @@ int FTI_RecoverL2(int group) if (ferror(qfd)) { FTI_Print("Error reading the data from the ckpt. 
file.", FTI_DBUG); + fclose(qfd); + if (jfd) fclose(jfd); if (lfd) fclose(lfd); if (pfd) fclose(pfd); - if (qfd) - fclose(qfd); free(blBuf1); free(blBuf2); @@ -566,10 +566,10 @@ int FTI_RecoverL2(int group) if (ferror(lfd)) { FTI_Print("Errors writting the data in the R2 checkpoint file.", FTI_DBUG); + fclose(lfd); + if (jfd) fclose(jfd); - if (lfd) - fclose(lfd); if (pfd) fclose(pfd); if (qfd) @@ -587,8 +587,8 @@ int FTI_RecoverL2(int group) if (ferror(jfd)) { FTI_Print("Errors writting the data in the R2 partner ckpt. file.", FTI_DBUG); - if (jfd) - fclose(jfd); + fclose(jfd); + if (lfd) fclose(lfd); if (pfd) From 610a84370c1eccc4837de75372f6c71a85c75624 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 2 Mar 2016 11:07:33 +0100 Subject: [PATCH 63/93] Fixing 'ignoring number of bytes read' --- src/api.c | 2 +- src/postckpt.c | 54 +++++++++++++++++++++++++++++++++++++++----------- src/postreco.c | 14 ++++++------- 3 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/api.c b/src/api.c index a3aa4b4b3..b718d695a 100644 --- a/src/api.c +++ b/src/api.c @@ -382,7 +382,7 @@ int FTI_Recover() return FTI_NSCS; } for (i = 0; i < FTI_Exec.nbVar; i++) { - (void)fread(FTI_Data[i].ptr, 1, FTI_Data[i].size, fd); + size_t bytes = fread(FTI_Data[i].ptr, 1, FTI_Data[i].size, fd); if (ferror(fd)) { FTI_Print("Could not read FTI checkpoint file.", FTI_EROR); fclose(fd); diff --git a/src/postckpt.c b/src/postckpt.c index 4c1abd294..44341c4a3 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -90,7 +90,7 @@ int FTI_Ptner(int group) if ((fs - pos) < FTI_Conf.blockSize) bSize = fs - pos; - (void)fread(blBuf1, sizeof(char), bSize, lfd); + size_t bytes = fread(blBuf1, sizeof(char), bSize, lfd); if (ferror(lfd)) { FTI_Print("Error reading data from the L2 ckpt. 
file", FTI_DBUG); @@ -102,7 +102,7 @@ int FTI_Ptner(int group) return FTI_NSCS; } - MPI_Isend(blBuf1, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); + MPI_Isend(blBuf1, bytes, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); MPI_Irecv(blBuf2, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv); MPI_Wait(&reqSend, &status); MPI_Wait(&reqRecv, &status); @@ -193,31 +193,54 @@ int FTI_RSenc(int group) } } - while (pos < ps) { // For each block + // For each block + while (pos < ps) { if ((fs - pos) < bs) remBsize = fs - pos; - fread(myData, sizeof(char), remBsize, lfd); // Reading checkpoint files + + // Reading checkpoint files + size_t bytes = fread(myData, sizeof(char), remBsize, lfd); + if (ferror(lfd)) { + FTI_Print("FTI failed to read from L3 ckpt. file.", FTI_EROR); + + free(data); + free(matrix); + free(coding); + free(myData); + + fclose(lfd); + fclose(efd); + + return FTI_NSCS; + } + dest = FTI_Topo.groupRank; i = FTI_Topo.groupRank; offset = 0; init = 0; cnt = 0; - while (cnt < FTI_Topo.groupSize) { // For each encoding + + // For each encoding + while (cnt < FTI_Topo.groupSize) { if (cnt == 0) { - memcpy(&(data[offset * bs]), myData, sizeof(char) * bs); + memcpy(&(data[offset * bs]), myData, sizeof(char) * bytes); } else { MPI_Wait(&reqSend, &status); MPI_Wait(&reqRecv, &status); } - if (cnt != FTI_Topo.groupSize - 1) { // At every loop *but* the last one we send the data + + // At every loop *but* the last one we send the data + if (cnt != FTI_Topo.groupSize - 1) { dest = (dest + FTI_Topo.groupSize - 1) % FTI_Topo.groupSize; src = (i + 1) % FTI_Topo.groupSize; - MPI_Isend(myData, bs, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); + MPI_Isend(myData, bytes, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); MPI_Irecv(&(data[(1 - offset) * bs]), bs, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv); } + matVal = matrix[FTI_Topo.groupRank * 
FTI_Topo.groupSize + i]; - if (matVal == 1) { // First copy or xor any data that does not need to be multiplied by a factor + // First copy or xor any data that does not need to be multiplied by a factor + if (matVal == 1) { if (init == 0) { memcpy(coding, &(data[offset * bs]), bs); init = 1; @@ -226,16 +249,23 @@ int FTI_RSenc(int group) galois_region_xor(&(data[offset * bs]), coding, coding, bs); } } - if (matVal != 0 && matVal != 1) { // Then the data that needs to be multiplied by a factor + + // Then the data that needs to be multiplied by a factor + if (matVal != 0 && matVal != 1) { galois_w16_region_multiply(&(data[offset * bs]), matVal, bs, coding, init); init = 1; } + i = (i + 1) % FTI_Topo.groupSize; offset = 1 - offset; cnt++; } - fwrite(coding, sizeof(char), remBsize, efd); // Writting encoded checkpoints - pos = pos + bs; // Next block + + // Writting encoded checkpoints + fwrite(coding, sizeof(char), remBsize, efd); + + // Next block + pos = pos + bs; } free(data); diff --git a/src/postreco.c b/src/postreco.c index 608f08700..104cb6647 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -159,8 +159,8 @@ int FTI_Decode(int fs, int maxFs, int* erased) while (pos < ps) { // Reading the data if (erased[FTI_Topo.groupRank] == 0) { - (void)fread(data[FTI_Topo.groupRank] + 0, sizeof(char), bs, fd); - (void)fread(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); + size_t data_size = fread(data[FTI_Topo.groupRank] + 0, sizeof(char), bs, fd); + size_t coding_size = fread(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); if (ferror(fd) || ferror(efd)) { FTI_Print("R3 cannot from the ckpt. file or the encoded ckpt. 
file.", FTI_DBUG); @@ -501,13 +501,13 @@ int FTI_RecoverL2(int group) // Checkpoint files exchange while (pos < ps) { if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { - (void)fread(blBuf1, sizeof(char), FTI_Conf.blockSize, pfd); + size_t bytes = fread(blBuf1, sizeof(char), FTI_Conf.blockSize, pfd); if (ferror(pfd)) { FTI_Print("Error reading the data from the partner ckpt. file.", FTI_DBUG); fclose(pfd); - + if (jfd) fclose(jfd); if (lfd) @@ -523,10 +523,10 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } - MPI_Isend(blBuf1, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend1); + MPI_Isend(blBuf1, bytes, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend1); } if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { - (void)fread(blBuf3, sizeof(char), FTI_Conf.blockSize, qfd); + size_t bytes = fread(blBuf3, sizeof(char), FTI_Conf.blockSize, qfd); if (ferror(qfd)) { FTI_Print("Error reading the data from the ckpt. file.", FTI_DBUG); @@ -548,7 +548,7 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } - MPI_Isend(blBuf3, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend2); + MPI_Isend(blBuf3, bytes, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend2); } if (erased[FTI_Topo.groupRank]) { MPI_Irecv(blBuf2, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv1); From e1d717ab525809531ce70e7b3c007fa8cf51ba20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 2 Mar 2016 11:19:49 +0100 Subject: [PATCH 64/93] Fixing 'buffer overflows' --- deps/iniparser/iniparser.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/deps/iniparser/iniparser.c b/deps/iniparser/iniparser.c index 5f6d6262e..d9e49ab1c 100644 --- a/deps/iniparser/iniparser.c +++ b/deps/iniparser/iniparser.c @@ -74,13 +74,18 @@ static char * strlwc(const char * s) static char * strstrip(const char * s) { static char l[ASCIILINESZ+1]; - char * last ; + char 
* last; + int len; if (s==NULL) return NULL ; while (isspace((int)*s) && *s) s++; memset(l, 0, ASCIILINESZ+1); - strcpy(l, s); + + len = (int)strlen(s); + if (len > ASCIILINESZ) + len = ASCIILINESZ; + strncpy(l, s, len); last = l + strlen(l); while (last > l) { if (!isspace((int)*(last-1))) @@ -562,7 +567,10 @@ static line_status iniparser_line( char line[ASCIILINESZ+1]; int len ; - strcpy(line, strstrip(input_line)); + len = (int)strlen(strstrip(input_line)); + if (len > ASCIILINESZ) + len = ASCIILINESZ; + strncpy(line, strstrip(input_line), len); len = (int)strlen(line); sta = LINE_UNPROCESSED ; From 775f868e2e0ddb52da78d98b6afe6282388150c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 2 Mar 2016 11:57:02 +0100 Subject: [PATCH 65/93] Fixing some TOCTOU problems by removing unnecessary access function calls --- src/api.c | 5 +---- src/meta.c | 1 - src/postckpt.c | 13 ++----------- src/postreco.c | 5 +---- src/tools.c | 2 +- 5 files changed, 5 insertions(+), 21 deletions(-) diff --git a/src/api.c b/src/api.c index b718d695a..2f7808f8f 100644 --- a/src/api.c +++ b/src/api.c @@ -372,10 +372,7 @@ int FTI_Recover() sprintf(fn, "%s/%s", FTI_Ckpt[FTI_Exec.ckptLvel].dir, FTI_Exec.ckptFile); sprintf(str, "Trying to load FTI checkpoint file (%s)...", fn); FTI_Print(str, FTI_DBUG); - if (access(fn, F_OK) != 0) { - FTI_Print("FTI checkpoint file is NOT accesible.", FTI_EROR); - return FTI_NSCS; - } + fd = fopen(fn, "rb"); if (fd == NULL) { FTI_Print("Could not open FTI checkpoint file.", FTI_EROR); diff --git a/src/meta.c b/src/meta.c index 0bad5ebe7..9e4c7e804 100644 --- a/src/meta.c +++ b/src/meta.c @@ -111,7 +111,6 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) if (access(FTI_Conf.mTmpDir, F_OK) != 0) { if (mkdir(FTI_Conf.mTmpDir, 0777) == -1) FTI_Print("Cannot create directory", FTI_EROR); - } sprintf(buf, "%s/sector%d-group%d.fti", FTI_Conf.mTmpDir, FTI_Topo.sectorID, FTI_Topo.groupID); if (remove(buf) == -1) 
diff --git a/src/postckpt.c b/src/postckpt.c index 44341c4a3..99cd18869 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -64,9 +64,7 @@ int FTI_Ptner(int group) sprintf(str, "L2 trying to access local ckpt. file (%s).", lfn); FTI_Print(str, FTI_DBUG); - res = FTI_Try(access(lfn, R_OK), " access the L2 checkpoint file."); - if (res == FTI_NSCS) - return FTI_NSCS; + dest = FTI_Topo.right; src = FTI_Topo.left; @@ -163,11 +161,9 @@ int FTI_RSenc(int group) sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); sprintf(lfn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Conf.lTmpDir, FTI_Exec.ckptID, i); + sprintf(str, "L3 trying to access local ckpt. file (%s).", lfn); FTI_Print(str, FTI_DBUG); - res = FTI_Try(access(lfn, R_OK), "access the L3 checkpoint file."); - if (res != FTI_SCES) - return FTI_NSCS; lfd = fopen(lfn, "rb"); if (lfd == NULL) { @@ -330,11 +326,6 @@ int FTI_Flush(int group, int level) sprintf(gfn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); sprintf(str, "L4 trying to access local ckpt. file (%s).", lfn); FTI_Print(str, FTI_DBUG); - if (access(lfn, R_OK) != 0) { - FTI_Print("L4 cannot access the checkpoint file.", FTI_EROR); - - return FTI_NSCS; - } lfd = fopen(lfn, "rb"); if (lfd == NULL) { diff --git a/src/postreco.c b/src/postreco.c index 104cb6647..c2e5a3fdd 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -849,10 +849,7 @@ int FTI_RecoverL4(int group) // Open and resize files sprintf(gfn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec.ckptFile); sprintf(lfn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); - if (access(gfn, R_OK) != 0) { - FTI_Print("R4 cannot read the checkpoint file in the PFS.", FTI_DBUG); - return FTI_NSCS; - } + if (truncate(gfn, ps) == -1) { FTI_Print("R4 cannot truncate the ckpt. 
file in the PFS.", FTI_DBUG); return FTI_NSCS; diff --git a/src/tools.c b/src/tools.c index 23c7d4304..8e171bb0d 100644 --- a/src/tools.c +++ b/src/tools.c @@ -124,7 +124,7 @@ int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]) int FTI_RmDir(char path[FTI_BUFS], int flag) { - if (flag && (!access(path, R_OK))) { + if (flag) { DIR* dp; char buf[FTI_BUFS], fn[FTI_BUFS], fil[FTI_BUFS]; struct dirent* ep; From 0629b728aedc52876641b1532bb6503134850b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 2 Mar 2016 12:04:12 +0100 Subject: [PATCH 66/93] Adding missing bracket --- src/api.c | 2 +- src/meta.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/api.c b/src/api.c index 2f7808f8f..73706335a 100644 --- a/src/api.c +++ b/src/api.c @@ -372,7 +372,7 @@ int FTI_Recover() sprintf(fn, "%s/%s", FTI_Ckpt[FTI_Exec.ckptLvel].dir, FTI_Exec.ckptFile); sprintf(str, "Trying to load FTI checkpoint file (%s)...", fn); FTI_Print(str, FTI_DBUG); - + fd = fopen(fn, "rb"); if (fd == NULL) { FTI_Print("Could not open FTI checkpoint file.", FTI_EROR); diff --git a/src/meta.c b/src/meta.c index 9e4c7e804..0bad5ebe7 100644 --- a/src/meta.c +++ b/src/meta.c @@ -111,6 +111,7 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) if (access(FTI_Conf.mTmpDir, F_OK) != 0) { if (mkdir(FTI_Conf.mTmpDir, 0777) == -1) FTI_Print("Cannot create directory", FTI_EROR); + } sprintf(buf, "%s/sector%d-group%d.fti", FTI_Conf.mTmpDir, FTI_Topo.sectorID, FTI_Topo.groupID); if (remove(buf) == -1) From 02eac89011a452817019a5b445ab91fdfe7416c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 18:25:35 +0200 Subject: [PATCH 67/93] Fixing uninitialized scalar variable in jerasure --- deps/jerasure/jerasure.c | 1 + 1 file changed, 1 insertion(+) diff --git a/deps/jerasure/jerasure.c b/deps/jerasure/jerasure.c index d5e0f1015..eb2433a8a 100644 --- a/deps/jerasure/jerasure.c +++ b/deps/jerasure/jerasure.c 
@@ -1254,6 +1254,7 @@ int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix) ptr = bitmatrix; bestdiff = k*w+1; + bestrow = -1; top = 0; for (i = 0; i < m*w; i++) { no = 0; From 81bc5e3cb21774253892fde8c9288e885413b262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 19:09:16 +0200 Subject: [PATCH 68/93] Fixing dereference before null check --- src/postreco.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index c2e5a3fdd..a685f752d 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -589,8 +589,7 @@ int FTI_RecoverL2(int group) fclose(jfd); - if (lfd) - fclose(lfd); + fclose(lfd); if (pfd) fclose(pfd); if (qfd) From 2d3c3c148bf9d94c456c0fc272d5785e089baf00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 19:13:37 +0200 Subject: [PATCH 69/93] Fixing dead code --- deps/jerasure/galois.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deps/jerasure/galois.c b/deps/jerasure/galois.c index 6bb42dc4d..167489e2a 100644 --- a/deps/jerasure/galois.c +++ b/deps/jerasure/galois.c @@ -393,8 +393,9 @@ int galois_single_divide(int a, int b, int w) sum_j = galois_inverse(b, w); return galois_single_multiply(a, sum_j, w); } - fprintf(stderr, "Galois_single_divide - no implementation for w=%d\n", w); - exit(1); + // unreachable code + //fprintf(stderr, "Galois_single_divide - no implementation for w=%d\n", w); + //exit(1); } int galois_shift_divide(int a, int b, int w) From 6439ca8d709b74a98d77bdaa76f2c23547eaeb55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 19:45:09 +0200 Subject: [PATCH 70/93] Fixing improper use of negative value --- deps/jerasure/jerasure.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deps/jerasure/jerasure.c b/deps/jerasure/jerasure.c index eb2433a8a..f964ded67 100644 --- a/deps/jerasure/jerasure.c +++ b/deps/jerasure/jerasure.c 
@@ -400,6 +400,8 @@ int jerasure_invert_matrix(int *mat, int *inv, int rows, int w) tmp = mat[row_start+i]; if (tmp != 1) { inverse = galois_single_divide(1, tmp, w); + if (inverse < 0) + return -1; for (j = 0; j < cols; j++) { mat[row_start+j] = galois_single_multiply(mat[row_start+j], inverse, w); inv[row_start+j] = galois_single_multiply(inv[row_start+j], inverse, w); From adf9326c692251d4e19e81442ad1e058605c9461 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 19:56:45 +0200 Subject: [PATCH 71/93] Fixing unused value --- deps/jerasure/jerasure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deps/jerasure/jerasure.c b/deps/jerasure/jerasure.c index f964ded67..ee7201d70 100644 --- a/deps/jerasure/jerasure.c +++ b/deps/jerasure/jerasure.c @@ -1322,7 +1322,8 @@ int **jerasure_smart_bitmatrix_to_schedule(int k, int m, int w, int *bitmatrix) operations[op][1] = j%w; operations[op][2] = k+row/w; operations[op][3] = row%w; - optodo = 1; + // unused value + //optodo = 1; op++; } } From 618017e17972ccf70f0619d10faf59c5626a2d21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 20:49:16 +0200 Subject: [PATCH 72/93] Fixing TOCTOU problems (1) --- src/conf.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/conf.c b/src/conf.c index c19dc5e33..9ec20d354 100644 --- a/src/conf.c +++ b/src/conf.c @@ -310,8 +310,8 @@ int FTI_CreateDirs() // Create metadata timestamp directory snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf.metadDir, FTI_Exec.id); - if (access(fn, F_OK) != 0) { - if (mkdir(fn, 0777) == -1) + if (mkdir(fn, 0777) == -1) { + if (errno != EEXIST) FTI_Print("Cannot create metadata timestamp directory", FTI_EROR); } snprintf(FTI_Conf.metadDir, FTI_BUFS, "%s", fn); @@ -324,8 +324,8 @@ int FTI_CreateDirs() // Create global checkpoint timestamp directory snprintf(fn, FTI_BUFS, "%s", FTI_Conf.glbalDir); snprintf(FTI_Conf.glbalDir, FTI_BUFS, 
"%s/%s", fn, FTI_Exec.id); - if (access(FTI_Conf.glbalDir, F_OK) != 0) { - if (mkdir(FTI_Conf.glbalDir, 0777) == -1) + if (mkdir(FTI_Conf.glbalDir, 0777) == -1) { + if (errno != EEXIST) FTI_Print("Cannot create global checkpoint timestamp directory", FTI_EROR); } snprintf(FTI_Conf.gTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf.glbalDir); @@ -334,8 +334,8 @@ int FTI_CreateDirs() // Create local checkpoint timestamp directory if (FTI_Conf.test) { // If local test generate name by topology snprintf(fn, FTI_BUFS, "%s/node%d", FTI_Conf.localDir, FTI_Topo.myRank / FTI_Topo.nodeSize); - if (access(fn, F_OK) != 0) { - if (mkdir(fn, 0777) == -1) + if (mkdir(fn, 0777) == -1) { + if (errno != EEXIST) FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); } } @@ -343,8 +343,8 @@ int FTI_CreateDirs() snprintf(fn, FTI_BUFS, "%s", FTI_Conf.localDir); } snprintf(FTI_Conf.localDir, FTI_BUFS, "%s/%s", fn, FTI_Exec.id); - if (access(FTI_Conf.localDir, F_OK) != 0) { - if (mkdir(FTI_Conf.localDir, 0777) == -1) + if (mkdir(FTI_Conf.localDir, 0777) == -1) { + if (errno != EEXIST) FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); } snprintf(FTI_Conf.lTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf.localDir); From 3d8a9bce5982b71f7ca3329bcec4142135cced94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 20:55:23 +0200 Subject: [PATCH 73/93] Fixing TOCTOU problems (2) --- src/conf.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/conf.c b/src/conf.c index 9ec20d354..a49677237 100644 --- a/src/conf.c +++ b/src/conf.c @@ -257,11 +257,10 @@ int FTI_TestDirectories() char str[FTI_BUFS]; // Checking local directory - if (access(FTI_Conf.localDir, W_OK) != 0) { - sprintf(str, "Checking the local directory (%s)...", FTI_Conf.localDir); - FTI_Print(str, FTI_DBUG); - FTI_Print("The local directory does not exist or has no write access.", FTI_DBUG); - if (mkdir(FTI_Conf.localDir, 0777) != 0) { 
+ sprintf(str, "Checking the local directory (%s)...", FTI_Conf.localDir); + FTI_Print(str, FTI_DBUG); + if (mkdir(FTI_Conf.localDir, 0777) != 0) { + if (errno != EEXIST) { FTI_Print("The local directory could NOT be created.", FTI_WARN); return FTI_NSCS; } @@ -271,9 +270,8 @@ int FTI_TestDirectories() // Checking metadata directory sprintf(str, "Checking the metadata directory (%s)...", FTI_Conf.metadDir); FTI_Print(str, FTI_DBUG); - if (access(FTI_Conf.metadDir, W_OK) != 0) { - FTI_Print("The metadata directory does not exist or has no write access.", FTI_DBUG); - if (mkdir(FTI_Conf.metadDir, 0777) != 0) { + if (mkdir(FTI_Conf.metadDir, 0777) != 0) { + if (errno != EEXIST) { FTI_Print("The metadata directory could NOT be created.", FTI_WARN); return FTI_NSCS; } @@ -282,9 +280,8 @@ int FTI_TestDirectories() // Checking global directory sprintf(str, "Checking the global directory (%s)...", FTI_Conf.glbalDir); FTI_Print(str, FTI_DBUG); - if (access(FTI_Conf.glbalDir, W_OK) != 0) { - FTI_Print("The global directory does not exist or has no write access.", FTI_DBUG); - if (mkdir(FTI_Conf.glbalDir, 0777) != 0) { + if (mkdir(FTI_Conf.glbalDir, 0777) != 0) { + if (errno != EEXIST) { FTI_Print("The global directory could NOT be created.", FTI_WARN); return FTI_NSCS; } From 76db07cff6dceb595be440c3ec26ec265307366c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 20:56:41 +0200 Subject: [PATCH 74/93] Fixing TOCTOU problems (3) --- src/meta.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/meta.c b/src/meta.c index 0bad5ebe7..daae505c2 100644 --- a/src/meta.c +++ b/src/meta.c @@ -108,8 +108,8 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) // Remove topology section iniparser_unset(ini, "topology"); - if (access(FTI_Conf.mTmpDir, F_OK) != 0) { - if (mkdir(FTI_Conf.mTmpDir, 0777) == -1) + if (mkdir(FTI_Conf.mTmpDir, 0777) == -1) { + if (errno != EEXIST) FTI_Print("Cannot create 
directory", FTI_EROR); } From 066303ee6cd065165da783be77d2ea7e381be8f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 20:57:38 +0200 Subject: [PATCH 75/93] Fixing TOCTOU problems (4) --- src/postckpt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/postckpt.c b/src/postckpt.c index 99cd18869..b61f13483 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -299,8 +299,8 @@ int FTI_Flush(int group, int level) if (res != FTI_SCES) return FTI_NSCS; - if (access(FTI_Conf.gTmpDir, F_OK) != 0) { - if (mkdir(FTI_Conf.gTmpDir, 0777) == -1) + if (mkdir(FTI_Conf.gTmpDir, 0777) == -1) { + if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); } From c589e622a89c8f90cbe32756f056b7fd29d8695d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 20:59:46 +0200 Subject: [PATCH 76/93] Fixing TOCTOU problems (5) --- src/postreco.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index a685f752d..dc299739e 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -332,8 +332,8 @@ int FTI_RecoverL2(int group) src = FTI_Topo.left; dest = FTI_Topo.right; - if (access(FTI_Ckpt[2].dir, F_OK) != 0) - if (mkdir(FTI_Ckpt[2].dir, 0777) == -1) + if (mkdir(FTI_Ckpt[2].dir, 0777) == -1) + if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); // Checking erasures From 79f89eb6258761825162cfe4202cc76d692a7a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 21:00:47 +0200 Subject: [PATCH 77/93] Fixing TOCTOU problems (6) --- src/postreco.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index dc299739e..d31c47640 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -30,8 +30,8 @@ int FTI_Decode(int fs, int maxFs, int* erased) if (ps < maxFs) ps = ps + FTI_Conf.blockSize; // Calculating padding size - if (access(FTI_Ckpt[3].dir, 
F_OK) != 0) - if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) + if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) + if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); From 3aa26d6df106e8e4c9166479e836af6e80b3f1e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 21:01:36 +0200 Subject: [PATCH 78/93] Fixing TOCTOU problems (7) --- src/postreco.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index d31c47640..080b7006b 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -755,8 +755,8 @@ int FTI_RecoverL3(int group) char str[FTI_BUFS]; gs = FTI_Topo.groupSize; - if (access(FTI_Ckpt[3].dir, F_OK) != 0) - if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) + if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) + if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); // Checking erasures From 2de45de0fa0a2100d9a4479e98f428988282c18c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 29 Mar 2016 21:04:45 +0200 Subject: [PATCH 79/93] Fixing TOCTOU problems (8) --- src/postreco.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/postreco.c b/src/postreco.c index 080b7006b..c89932a51 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -812,14 +812,9 @@ int FTI_RecoverL4(int group) gs = FTI_Topo.groupSize; if (FTI_Topo.nodeRank == 0 || FTI_Topo.nodeRank == 1) { - if (access(FTI_Ckpt[1].dir, F_OK) != 0) { - FTI_Print("Directory L1 missing.", FTI_DBUG); - if (mkdir(FTI_Ckpt[1].dir, 0777) == 0) { - FTI_Print("Directory L1 created.", FTI_DBUG); - } - else { + if (mkdir(FTI_Ckpt[1].dir, 0777) == -1) { + if (errno != EEXIST) FTI_Print("Directory L1 could NOT be created.", FTI_WARN); - } } } MPI_Barrier(FTI_COMM_WORLD); From 649ce7feb53545c828e9672e000e0ea9c16b44a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 31 Mar 2016 20:13:25 +0200 Subject: 
[PATCH 80/93] Adding option for building of examples --- CMakeLists.txt | 5 ++++- examples/CMakeLists.txt | 3 --- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 34ea45c4d..709f86f2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") endif() option(ENABLE_FORTRAN "Enables the generation of the Fortran wrapper for FTI" ON) +option(ENABLE_EXAMPLES "Enables the generation of examples" OFF) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeScripts") include(AppendProperty) @@ -77,4 +78,6 @@ if(NOT "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") set(FTI_INCLUDE_PATH "${CMAKE_CURRENT_BINARY_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/include" PARENT_SCOPE) endif() -add_subdirectory(examples) +if(ENABLE_EXAMPLES) + add_subdirectory(examples) +endif() diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d720c1b59..6c06176e1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,6 +1,5 @@ link_directories(${CMAKE_BINARY_DIR}/lib) - add_executable(hd.exe heatdis.c) target_link_libraries(hd.exe fti ${MPI_C_LIBRARIES} m) @@ -14,7 +13,6 @@ if(MPI_C_LINK_FLAGS) LINK_FLAGS "${MPI_C_LINK_FLAGS}") endif() - add_executable(hd2.exe heatd2.c) target_link_libraries(hd2.exe fti ${MPI_C_LIBRARIES} m) @@ -28,7 +26,6 @@ if(MPI_C_LINK_FLAGS) LINK_FLAGS "${MPI_C_LINK_FLAGS}") endif() - if(ENABLE_FORTRAN) add_executable(hdf.exe fheatdis.f90) target_link_libraries(hdf.exe fti_f90 fti ${MPI_Fortran_LIBRARIES} m) From 329989f1cef8cd1f378e11f7938b2ce4c276c97e Mon Sep 17 00:00:00 2001 From: Leonardo Bautista Gomez Date: Tue, 26 Apr 2016 16:51:05 +0200 Subject: [PATCH 81/93] Add memset 0 before using buffer --- deps/iniparser/iniparser.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deps/iniparser/iniparser.c b/deps/iniparser/iniparser.c index d9e49ab1c..bafb77a60 100644 --- a/deps/iniparser/iniparser.c +++ 
b/deps/iniparser/iniparser.c @@ -567,6 +567,7 @@ static line_status iniparser_line( char line[ASCIILINESZ+1]; int len ; + memset(line, 0, ASCIILINESZ); len = (int)strlen(strstrip(input_line)); if (len > ASCIILINESZ) len = ASCIILINESZ; @@ -616,6 +617,7 @@ static line_status iniparser_line( } else { /* Generate syntax error */ sta = LINE_ERROR ; + printf("===== > %s ===> %s\n", input_line, line); } return sta ; } From 09903943c48aa198b5af1a8fbed49819519d334d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 6 Apr 2016 23:30:43 +0200 Subject: [PATCH 82/93] Fixing 'undefined references' in examples --- CMakeLists.txt | 16 +++++++++------- deps/iniparser/CMakeLists.txt | 2 +- deps/jerasure/CMakeLists.txt | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 709f86f2b..5ae08d395 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,10 @@ -cmake_minimum_required(VERSION 2.8 FATAL_ERROR) +cmake_minimum_required(VERSION 2.8.8 FATAL_ERROR) if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") project(FTI C Fortran) endif() -option(ENABLE_FORTRAN "Enables the generation of the Fortran wrapper for FTI" ON) +option(ENABLE_FORTRAN "Enables the generation of the Fortran wrapper for FTI" OFF) option(ENABLE_EXAMPLES "Enables the generation of examples" OFF) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeScripts") @@ -31,17 +31,19 @@ set(SRC_FTI append_property(SOURCE ${SRC_FTI} PROPERTY COMPILE_FLAGS "${MPI_C_COMPILE_FLAGS}") -add_library(fti.static STATIC ${SRC_FTI}) -target_link_libraries(fti.static ${MPI_C_LIBRARIES} iniparser jerasure) - -add_library(fti.shared SHARED ${SRC_FTI}) -target_link_libraries(fti.shared ${MPI_C_LIBRARIES} iniparser jerasure) +add_library(fti.static STATIC ${SRC_FTI} + $ $) +add_library(fti.shared SHARED ${SRC_FTI} + $ $) append_property(TARGET fti.static fti.shared PROPERTY LINK_FLAGS "${MPI_C_LINK_FLAGS}") set_property(TARGET 
fti.static fti.shared PROPERTY OUTPUT_NAME fti) +target_link_libraries(fti.static ${MPI_C_LIBRARIES}) +target_link_libraries(fti.shared ${MPI_C_LIBRARIES}) + install(TARGETS fti.static fti.shared DESTINATION lib) install(FILES "include/fti.h" diff --git a/deps/iniparser/CMakeLists.txt b/deps/iniparser/CMakeLists.txt index 4de9fe09c..27b4cd0a7 100644 --- a/deps/iniparser/CMakeLists.txt +++ b/deps/iniparser/CMakeLists.txt @@ -5,4 +5,4 @@ set(SRC_iniparser append_property(SOURCE ${SRC_iniparser} PROPERTY COMPILE_FLAGS "-fPIC") -add_library(iniparser ${SRC_iniparser}) +add_library(iniparser OBJECT ${SRC_iniparser}) diff --git a/deps/jerasure/CMakeLists.txt b/deps/jerasure/CMakeLists.txt index 0e0c0df41..b2bc59d2f 100644 --- a/deps/jerasure/CMakeLists.txt +++ b/deps/jerasure/CMakeLists.txt @@ -5,4 +5,4 @@ set(SRC_jerasure append_property(SOURCE ${SRC_jerasure} PROPERTY COMPILE_FLAGS "-fPIC") -add_library(jerasure ${SRC_jerasure}) +add_library(jerasure OBJECT ${SRC_jerasure}) From a50929901a27bb6d194aaa01a1878f8e5ea1eb02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 6 Apr 2016 23:58:02 +0200 Subject: [PATCH 83/93] Fixing problem with too low version of cmake in travis environment --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a59b617ce..dac43a37b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,3 +1,6 @@ +sudo: required +dist: trusty + language: c env: @@ -26,7 +29,7 @@ cache: before_install: - sudo apt-get update -qq - - sudo apt-get install -y openmpi-bin libopenmpi-dev gfortran + - sudo apt-get install -y cmake gcc gfortran openmpi-bin libopenmpi-dev before_script: - mkdir $BUILD_DIR From 596c00930e5c0fdd02f2da288e7638ad9199f508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Thu, 7 Apr 2016 14:26:21 +0200 Subject: [PATCH 84/93] Fixing problems with reading of the config file in examples --- deps/iniparser/iniparser.c | 9 ++------- 1 file 
changed, 2 insertions(+), 7 deletions(-) diff --git a/deps/iniparser/iniparser.c b/deps/iniparser/iniparser.c index bafb77a60..6d2772c98 100644 --- a/deps/iniparser/iniparser.c +++ b/deps/iniparser/iniparser.c @@ -75,17 +75,12 @@ static char * strstrip(const char * s) { static char l[ASCIILINESZ+1]; char * last; - int len; if (s==NULL) return NULL ; while (isspace((int)*s) && *s) s++; memset(l, 0, ASCIILINESZ+1); - - len = (int)strlen(s); - if (len > ASCIILINESZ) - len = ASCIILINESZ; - strncpy(l, s, len); + strncpy(l, s, ASCIILINESZ); last = l + strlen(l); while (last > l) { if (!isspace((int)*(last-1))) @@ -567,7 +562,7 @@ static line_status iniparser_line( char line[ASCIILINESZ+1]; int len ; - memset(line, 0, ASCIILINESZ); + memset(line, 0, ASCIILINESZ + 1); len = (int)strlen(strstrip(input_line)); if (len > ASCIILINESZ) len = ASCIILINESZ; From 86e3be454095dc3683f461b2c9abb8e9d1c43139 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 1 Jun 2016 12:32:28 +0200 Subject: [PATCH 85/93] Moving some library global variables into static --- examples/CMakeLists.txt | 4 +- examples/Makefile | 8 +- examples/heatdis.c | 2 + include/fti.h | 8 -- src/api.c | 82 ++++++++++++++---- src/checkpoint.c | 145 +++++++++++++++++--------------- src/conf.c | 182 +++++++++++++++++++++------------------- src/interface.h | 114 ++++++++++++++++--------- src/meta.c | 59 +++++++------ src/postckpt.c | 108 ++++++++++++------------ src/postreco.c | 182 +++++++++++++++++++++------------------- src/recover.c | 80 +++++++++--------- src/tools.c | 103 ++++++++--------------- src/topo.c | 167 ++++++++++++++++++------------------ 14 files changed, 666 insertions(+), 578 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 6c06176e1..a0e507bda 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,7 +1,7 @@ link_directories(${CMAKE_BINARY_DIR}/lib) add_executable(hd.exe heatdis.c) -target_link_libraries(hd.exe fti 
${MPI_C_LIBRARIES} m) +target_link_libraries(hd.exe ${CMAKE_BINARY_DIR}/lib/libfti.a ${MPI_C_LIBRARIES} m) if(MPI_C_COMPILE_FLAGS) set_target_properties(hd.exe PROPERTIES @@ -14,7 +14,7 @@ if(MPI_C_LINK_FLAGS) endif() add_executable(hd2.exe heatd2.c) -target_link_libraries(hd2.exe fti ${MPI_C_LIBRARIES} m) +target_link_libraries(hd2.exe ${CMAKE_BINARY_DIR}/lib/libfti.a ${MPI_C_LIBRARIES} m) if(MPI_C_COMPILE_FLAGS) set_target_properties(hd2.exe PROPERTIES diff --git a/examples/Makefile b/examples/Makefile index 6a58f8c72..638de22b5 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -11,11 +11,11 @@ MPIRUN ?= mpirun ## FLAGS # Compiling using shared library -FTIFLAG = -I$(FTIPATH)/include -L$(FTIPATH)/lib -lfti -lm -FFTIFLAG = -I$(FTIPATH)/include -L$(FTIPATH)/lib -lfti_f90 -lfti -lm +#FTIFLAG = -I$(FTIPATH)/include -L$(FTIPATH)/lib -lfti -lm +#FFTIFLAG = -I$(FTIPATH)/include -L$(FTIPATH)/lib -lfti_f90 -lfti -lm # Compiling using static library -#FTIFLAG = -I$(FTIPATH)/include $(FTIPATH)/lib/libfti.a -lm -#FFTIFLAG = -I$(FTIPATH)/include $(FTIPATH)/lib/libfti_f90.a $(FTIPATH)/lib/libfti.a -lm +FTIFLAG = -I$(FTIPATH)/include $(FTIPATH)/lib/libfti.a -lm +FFTIFLAG = -I$(FTIPATH)/include $(FTIPATH)/lib/libfti_f90.a $(FTIPATH)/lib/libfti.a -lm ## TARGETS all: hd hd2 hdf diff --git a/examples/heatdis.c b/examples/heatdis.c index 9783be205..6523e5bf4 100644 --- a/examples/heatdis.c +++ b/examples/heatdis.c @@ -100,6 +100,7 @@ int main(int argc, char *argv[]) MPI_Init(&argc, &argv); FTI_Init(argv[2], MPI_COMM_WORLD); + MPI_Comm_size(FTI_COMM_WORLD, &nbProcs); MPI_Comm_rank(FTI_COMM_WORLD, &rank); @@ -110,6 +111,7 @@ int main(int argc, char *argv[]) g = (double *) malloc(sizeof(double *) * M * nbLines); initData(nbLines, M, rank, g); memSize = M * nbLines * 2 * sizeof(double) / (1024 * 1024); + if (rank == 0) printf("Local data size is %d x %d = %f MB (%d).\n", M, nbLines, memSize, arg); if (rank == 0) printf("Target precision : %f \n", PRECISION); if (rank == 0) 
printf("Maximum number of iterations : %d \n", ITER_TIMES); diff --git a/include/fti.h b/include/fti.h index f7c0f38af..f30de3666 100644 --- a/include/fti.h +++ b/include/fti.h @@ -218,14 +218,6 @@ typedef struct FTIT_injection { /** FTI type declarator. */ /** MPI communicator that splits the global one into app and FTI appart. */ MPI_Comm FTI_COMM_WORLD; -/** Topology of the system. */ -FTIT_topology FTI_Topo; -/** Dynamic information for this execution. */ -FTIT_execution FTI_Exec; -/** Checkpoint information for all levels of checkpoint. */ -FTIT_checkpoint FTI_Ckpt[5]; -/** General configuration information used by FTI. */ -FTIT_configuration FTI_Conf; /** FTI data type for chars. */ FTIT_type FTI_CHAR; diff --git a/src/api.c b/src/api.c index 73706335a..1fe6af92d 100644 --- a/src/api.c +++ b/src/api.c @@ -7,6 +7,18 @@ #include "interface.h" +/** General configuration information used by FTI. */ +static FTIT_configuration FTI_Conf; + +/** Checkpoint information for all levels of checkpoint. */ +static FTIT_checkpoint FTI_Ckpt[5]; + +/** Dynamic information for this execution. */ +static FTIT_execution FTI_Exec; + +/** Topology of the system. */ +static FTIT_topology FTI_Topo; + /** Array of datasets and all their internal information. 
*/ static FTIT_dataset FTI_Data[FTI_BUFS]; @@ -23,7 +35,7 @@ static FTIT_injection FTI_Inje; /*-------------------------------------------------------------------------*/ void FTI_Abort() { - FTI_Clean(5, 0, FTI_Topo.myRank); + FTI_Clean(&FTI_Conf, &FTI_Topo, FTI_Ckpt, 5, 0, FTI_Topo.myRank); MPI_Abort(MPI_COMM_WORLD, -1); MPI_Finalize(); exit(1); @@ -53,31 +65,31 @@ int FTI_Init(char* configFile, MPI_Comm globalComm) FTI_Inje.timer = MPI_Wtime(); FTI_COMM_WORLD = globalComm; // Temporary before building topology FTI_Topo.splitRank = FTI_Topo.myRank; // Temporary before building topology - int res = FTI_Try(FTI_LoadConf(&FTI_Inje), "load configuration."); + int res = FTI_Try(FTI_LoadConf(&FTI_Conf, &FTI_Exec, &FTI_Topo, FTI_Ckpt, &FTI_Inje), "load configuration."); if (res == FTI_NSCS) FTI_Abort(); - res = FTI_Try(FTI_Topology(), "build topology."); + res = FTI_Try(FTI_Topology(&FTI_Conf, &FTI_Exec, &FTI_Topo), "build topology."); if (res == FTI_NSCS) FTI_Abort(); FTI_Try(FTI_InitBasicTypes(FTI_Data), "create the basic data types."); if (FTI_Topo.myRank == 0) - FTI_Try(FTI_UpdateConf(1), "update configuration file."); + FTI_Try(FTI_UpdateConf(&FTI_Conf, &FTI_Exec, 1), "update configuration file."); if (FTI_Topo.amIaHead) { // If I am a FTI dedicated process if (FTI_Exec.reco) { - res = FTI_Try(FTI_RecoverFiles(), "recover the checkpoint files."); + res = FTI_Try(FTI_RecoverFiles(&FTI_Conf, &FTI_Exec, &FTI_Topo, FTI_Ckpt), "recover the checkpoint files."); if (res == FTI_NSCS) FTI_Abort(); } res = 0; while (res != FTI_ENDW) { - res = FTI_Listen(); + res = FTI_Listen(&FTI_Conf, &FTI_Exec, &FTI_Topo, FTI_Ckpt); } FTI_Print("Head stopped listening.", FTI_DBUG); FTI_Finalize(); } else { // If I am an application process if (FTI_Exec.reco) { - res = FTI_Try(FTI_RecoverFiles(), "recover the checkpoint files."); + res = FTI_Try(FTI_RecoverFiles(&FTI_Conf, &FTI_Exec, &FTI_Topo, FTI_Ckpt), "recover the checkpoint files."); if (res == FTI_NSCS) FTI_Abort(); FTI_Exec.ckptCnt = 
FTI_Exec.ckptID; @@ -164,7 +176,7 @@ int FTI_Protect(int id, void* ptr, long count, FTIT_type type) else { if (FTI_Exec.nbVar >= FTI_BUFS) { FTI_Print("Too many variables registered.", FTI_EROR); - FTI_Clean(5, FTI_Topo.groupID, FTI_Topo.myRank); + FTI_Clean(&FTI_Conf, &FTI_Topo, FTI_Ckpt, 5, FTI_Topo.groupID, FTI_Topo.myRank); MPI_Abort(MPI_COMM_WORLD, -1); MPI_Finalize(); exit(1); @@ -319,7 +331,7 @@ int FTI_Checkpoint(int id, int level) } } t1 = MPI_Wtime(); - res = FTI_Try(FTI_WriteCkpt(FTI_Data), "write the checkpoint."); + res = FTI_Try(FTI_WriteCkpt(&FTI_Conf, &FTI_Exec, &FTI_Topo, FTI_Ckpt, FTI_Data), "write the checkpoint."); //MPI_Allreduce(&res, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); t2 = MPI_Wtime(); if (!FTI_Ckpt[FTI_Exec.ckptLvel].isInline) { // If postCkpt. work is Async. then send message.. @@ -336,7 +348,7 @@ int FTI_Checkpoint(int id, int level) FTI_Exec.wasLastOffline = 0; if (res != FTI_SCES) FTI_Exec.ckptLvel = FTI_REJW - FTI_BASE; - res = FTI_Try(FTI_PostCkpt(FTI_Topo.groupID, -1, 1), "postprocess the checkpoint."); + res = FTI_Try(FTI_PostCkpt(&FTI_Conf, &FTI_Exec, &FTI_Topo, FTI_Ckpt, FTI_Topo.groupID, -1, 1), "postprocess the checkpoint."); if (res == FTI_SCES) { FTI_Exec.wasLastOffline = 0; FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; @@ -414,7 +426,7 @@ int FTI_Snapshot() res = FTI_Try(FTI_Recover(), "recover the checkpointed data."); if (res == FTI_NSCS) { FTI_Print("Impossible to load the checkpoint data.", FTI_EROR); - FTI_Clean(5, FTI_Topo.groupID, FTI_Topo.myRank); + FTI_Clean(&FTI_Conf, &FTI_Topo, FTI_Ckpt, 5, FTI_Topo.groupID, FTI_Topo.myRank); MPI_Abort(MPI_COMM_WORLD, -1); MPI_Finalize(); exit(1); @@ -422,7 +434,7 @@ int FTI_Snapshot() } else { // If it is a checkpoint test res = FTI_SCES; - FTI_UpdateIterTime(); + FTI_UpdateIterTime(&FTI_Exec); if (FTI_Exec.ckptNext == FTI_Exec.ckptIcnt) { // If it is time to check for possible ckpt. 
(every minute) FTI_Print("Checking if it is time to checkpoint.", FTI_DBUG); FTI_Exec.ckptCnt++; // Increment minute counter @@ -485,7 +497,7 @@ int FTI_Finalize() // If we need to keep the last checkpoint if (FTI_Conf.saveLastCkpt) { if (FTI_Exec.lastCkptLvel != 4) { - FTI_Try(FTI_Flush(FTI_Topo.groupID, FTI_Exec.lastCkptLvel), "save the last ckpt. in the PFS."); + FTI_Try(FTI_Flush(&FTI_Conf, &FTI_Exec, &FTI_Topo, FTI_Ckpt, FTI_Topo.groupID, FTI_Exec.lastCkptLvel), "save the last ckpt. in the PFS."); MPI_Barrier(FTI_COMM_WORLD); if (FTI_Topo.splitRank == 0) { if (access(FTI_Ckpt[4].dir, 0) == 0) @@ -500,21 +512,59 @@ int FTI_Finalize() } } if (FTI_Topo.splitRank == 0) { - FTI_Try(FTI_UpdateConf(2), "update configuration file to 2."); + FTI_Try(FTI_UpdateConf(&FTI_Conf, &FTI_Exec, 2), "update configuration file to 2."); } buff = 6; // For cleaning only local storage } else { if (FTI_Topo.splitRank == 0) { - FTI_Try(FTI_UpdateConf(0), "update configuration file to 0."); + FTI_Try(FTI_UpdateConf(&FTI_Conf, &FTI_Exec, 0), "update configuration file to 0."); } buff = 5; // For cleaning everything } MPI_Barrier(FTI_Exec.globalComm); - FTI_Try(FTI_Clean(buff, FTI_Topo.groupID, FTI_Topo.myRank), "do final clean."); + FTI_Try(FTI_Clean(&FTI_Conf, &FTI_Topo, FTI_Ckpt, buff, FTI_Topo.groupID, FTI_Topo.myRank), "do final clean."); FTI_Print("FTI has been finalized.", FTI_INFO); return FTI_SCES; } +/*-------------------------------------------------------------------------*/ +/** + @brief Prints FTI messages. + @param msg Message to print. + @param priority Priority of the message to be printed. + @return void + + This function prints messages depending on their priority and the + verbosity level set by the user. DEBUG messages are printed by all + processes with their rank. INFO messages are printed by one process. + ERROR messages are printed with errno. 
+ **/ +/*-------------------------------------------------------------------------*/ +void FTI_Print(char* msg, int priority) +{ + if (priority >= FTI_Conf.verbosity) { + if (msg != NULL) { + switch (priority) { + case FTI_EROR: + fprintf(stderr, "[FTI Error - %06d] : %s : %s \n", FTI_Topo.myRank, msg, strerror(errno)); + break; + case FTI_WARN: + fprintf(stdout, "[FTI Warning %06d] : %s \n", FTI_Topo.myRank, msg); + break; + case FTI_INFO: + if (FTI_Topo.splitRank == 0) + fprintf(stdout, "[ FTI Information ] : %s \n", msg); + break; + case FTI_DBUG: + fprintf(stdout, "[FTI Debug - %06d] : %s \n", FTI_Topo.myRank, msg); + break; + default: + break; + } + } + } + fflush(stdout); +} diff --git a/src/checkpoint.c b/src/checkpoint.c index 8536fe7bd..9f9629f5d 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -18,39 +18,39 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_UpdateIterTime() +int FTI_UpdateIterTime(FTIT_execution* FTI_Exec) { int nbProcs, res; char str[FTI_BUFS]; - double last = FTI_Exec.iterTime; - FTI_Exec.iterTime = MPI_Wtime(); - if (FTI_Exec.ckptIcnt > 0) { - FTI_Exec.lastIterTime = FTI_Exec.iterTime - last; - FTI_Exec.totalIterTime = FTI_Exec.totalIterTime + FTI_Exec.lastIterTime; - if (FTI_Exec.ckptIcnt % FTI_Exec.syncIter == 0) { - FTI_Exec.meanIterTime = FTI_Exec.totalIterTime / FTI_Exec.ckptIcnt; - MPI_Allreduce(&FTI_Exec.meanIterTime, &FTI_Exec.globMeanIter, 1, MPI_DOUBLE, MPI_SUM, FTI_COMM_WORLD); + double last = FTI_Exec->iterTime; + FTI_Exec->iterTime = MPI_Wtime(); + if (FTI_Exec->ckptIcnt > 0) { + FTI_Exec->lastIterTime = FTI_Exec->iterTime - last; + FTI_Exec->totalIterTime = FTI_Exec->totalIterTime + FTI_Exec->lastIterTime; + if (FTI_Exec->ckptIcnt % FTI_Exec->syncIter == 0) { + FTI_Exec->meanIterTime = FTI_Exec->totalIterTime / FTI_Exec->ckptIcnt; + MPI_Allreduce(&FTI_Exec->meanIterTime, &FTI_Exec->globMeanIter, 1, MPI_DOUBLE, MPI_SUM, FTI_COMM_WORLD); MPI_Comm_size(FTI_COMM_WORLD, 
&nbProcs); - FTI_Exec.globMeanIter = FTI_Exec.globMeanIter / nbProcs; - if (FTI_Exec.globMeanIter > 60) { - FTI_Exec.ckptIntv = 1; + FTI_Exec->globMeanIter = FTI_Exec->globMeanIter / nbProcs; + if (FTI_Exec->globMeanIter > 60) { + FTI_Exec->ckptIntv = 1; } else { - FTI_Exec.ckptIntv = (1 * 60) / FTI_Exec.globMeanIter; + FTI_Exec->ckptIntv = (1 * 60) / FTI_Exec->globMeanIter; } - res = FTI_Exec.ckptLast + FTI_Exec.ckptIntv; - if (res >= FTI_Exec.ckptIcnt) { - FTI_Exec.ckptNext = res; + res = FTI_Exec->ckptLast + FTI_Exec->ckptIntv; + if (res >= FTI_Exec->ckptIcnt) { + FTI_Exec->ckptNext = res; } - if (FTI_Exec.syncIter < (FTI_Exec.ckptIntv / 2)) { - FTI_Exec.syncIter = FTI_Exec.syncIter * 2; + if (FTI_Exec->syncIter < (FTI_Exec->ckptIntv / 2)) { + FTI_Exec->syncIter = FTI_Exec->syncIter * 2; sprintf(str, "Iteration frequency : %.2f sec/iter => %d iter/min. Resync every %d iter.", - FTI_Exec.globMeanIter, FTI_Exec.ckptIntv, FTI_Exec.syncIter); + FTI_Exec->globMeanIter, FTI_Exec->ckptIntv, FTI_Exec->syncIter); FTI_Print(str, FTI_DBUG); } } } - FTI_Exec.ckptIcnt++; // Increment checkpoint loop counter + FTI_Exec->ckptIcnt++; // Increment checkpoint loop counter return FTI_SCES; } @@ -66,7 +66,10 @@ int FTI_UpdateIterTime() **/ /*-------------------------------------------------------------------------*/ -int FTI_WriteCkpt(FTIT_dataset* FTI_Data) +int FTI_WriteCkpt(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + FTIT_dataset* FTI_Data) + { int i, res; FILE* fd; @@ -74,16 +77,20 @@ int FTI_WriteCkpt(FTIT_dataset* FTI_Data) double tt = MPI_Wtime(); - snprintf(FTI_Exec.ckptFile, FTI_BUFS, "Ckpt%d-Rank%d.fti", FTI_Exec.ckptID, FTI_Topo.myRank); - if (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) { - sprintf(fn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); - if (mkdir(FTI_Conf.gTmpDir, 0777) == -1) - FTI_Print("Cannot create global directory", FTI_EROR); + snprintf(FTI_Exec->ckptFile, FTI_BUFS, 
"Ckpt%d-Rank%d.fti", FTI_Exec->ckptID, FTI_Topo->myRank); + if (FTI_Ckpt[4].isInline && FTI_Exec->ckptLvel == 4) { + sprintf(fn, "%s/%s", FTI_Conf->gTmpDir, FTI_Exec->ckptFile); + if (mkdir(FTI_Conf->gTmpDir, 0777) == -1) + if (errno != EEXIST) { + FTI_Print("Cannot create global directory", FTI_EROR); + } } else { - sprintf(fn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); - if (mkdir(FTI_Conf.lTmpDir, 0777) == -1) - FTI_Print("Cannot create local directory", FTI_EROR); + sprintf(fn, "%s/%s", FTI_Conf->lTmpDir, FTI_Exec->ckptFile); + if (mkdir(FTI_Conf->lTmpDir, 0777) == -1) + if (errno != EEXIST) { + FTI_Print("Cannot create local directory", FTI_EROR); + } } fd = fopen(fn, "wb"); @@ -92,7 +99,7 @@ int FTI_WriteCkpt(FTIT_dataset* FTI_Data) return FTI_NSCS; } - for (i = 0; i < FTI_Exec.nbVar; i++) { + for (i = 0; i < FTI_Exec->nbVar; i++) { if (fwrite(FTI_Data[i].ptr, FTI_Data[i].eleSize, FTI_Data[i].count, fd) != FTI_Data[i].count) { sprintf(str, "Dataset #%d could not be written.", FTI_Data[i].id); FTI_Print(str, FTI_EROR); @@ -116,8 +123,8 @@ int FTI_WriteCkpt(FTIT_dataset* FTI_Data) } sprintf(str, "Time writing checkpoint file : %f seconds.", MPI_Wtime() - tt); FTI_Print(str, FTI_DBUG); - int globalTmp = (FTI_Ckpt[4].isInline && FTI_Exec.ckptLvel == 4) ? 1 : 0; - res = FTI_Try(FTI_CreateMetadata(globalTmp), "create metadata."); + int globalTmp = (FTI_Ckpt[4].isInline && FTI_Exec->ckptLvel == 4) ? 1 : 0; + res = FTI_Try(FTI_CreateMetadata(FTI_Conf, FTI_Exec, FTI_Topo, globalTmp), "create metadata."); return res; } @@ -138,17 +145,18 @@ int FTI_WriteCkpt(FTIT_dataset* FTI_Data) **/ /*-------------------------------------------------------------------------*/ -int FTI_GroupClean(int level, int group, int pr) +int FTI_GroupClean(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + FTIT_checkpoint* FTI_Ckpt, int level, int group, int pr) { int i, rank; if (level == 0) { FTI_Print("Error postprocessing checkpoint. 
Discarding checkpoint...", FTI_WARN); } - rank = FTI_Topo.myRank; + rank = FTI_Topo->myRank; for (i = 0; i < pr; i++) { - if (FTI_Topo.amIaHead) - rank = FTI_Topo.body[i]; - FTI_Clean(level, i + group, rank); + if (FTI_Topo->amIaHead) + rank = FTI_Topo->body[i]; + FTI_Clean(FTI_Conf, FTI_Topo, FTI_Ckpt, level, i + group, rank); } return FTI_SCES; } @@ -169,63 +177,65 @@ int FTI_GroupClean(int level, int group, int pr) **/ /*-------------------------------------------------------------------------*/ -int FTI_PostCkpt(int group, int fo, int pr) +int FTI_PostCkpt(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + int group, int fo, int pr) { - int i, tres, res, level, nodeFlag, globalFlag = FTI_Topo.splitRank; + int i, tres, res, level, nodeFlag, globalFlag = FTI_Topo->splitRank; double t0, t1, t2, t3; char str[FTI_BUFS]; t0 = MPI_Wtime(); - res = (FTI_Exec.ckptLvel == (FTI_REJW - FTI_BASE)) ? FTI_NSCS : FTI_SCES; + res = (FTI_Exec->ckptLvel == (FTI_REJW - FTI_BASE)) ? 
FTI_NSCS : FTI_SCES; MPI_Allreduce(&res, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); if (tres != FTI_SCES) { - FTI_GroupClean(0, group, pr); + FTI_GroupClean(FTI_Conf, FTI_Topo, FTI_Ckpt, 0, group, pr); return FTI_NSCS; } t1 = MPI_Wtime(); for (i = 0; i < pr; i++) { - switch (FTI_Exec.ckptLvel) { + switch (FTI_Exec->ckptLvel) { case 4: - res += FTI_Flush(i + group, fo); + res += FTI_Flush(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, i + group, fo); break; case 3: - res += FTI_RSenc(i + group); + res += FTI_RSenc(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, i + group); break; case 2: - res += FTI_Ptner(i + group); + res += FTI_Ptner(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, i + group); break; case 1: - res += FTI_Local(i + group); + res += FTI_Local(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, i + group); break; } } MPI_Allreduce(&res, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); if (tres != FTI_SCES) { - FTI_GroupClean(0, group, pr); + FTI_GroupClean(FTI_Conf, FTI_Topo, FTI_Ckpt, 0, group, pr); return FTI_NSCS; } t2 = MPI_Wtime(); - FTI_GroupClean(FTI_Exec.ckptLvel, group, pr); + FTI_GroupClean(FTI_Conf, FTI_Topo, FTI_Ckpt, FTI_Exec->ckptLvel, group, pr); MPI_Barrier(FTI_COMM_WORLD); - nodeFlag = (((!FTI_Topo.amIaHead) && (FTI_Topo.nodeRank == 0)) || (FTI_Topo.amIaHead)) ? 1 : 0; + nodeFlag = (((!FTI_Topo->amIaHead) && (FTI_Topo->nodeRank == 0)) || (FTI_Topo->amIaHead)) ? 1 : 0; if (nodeFlag) { - level = (FTI_Exec.ckptLvel != 4) ? FTI_Exec.ckptLvel : 1; - if (rename(FTI_Conf.lTmpDir, FTI_Ckpt[level].dir) == -1) + level = (FTI_Exec->ckptLvel != 4) ? 
FTI_Exec->ckptLvel : 1; + if (rename(FTI_Conf->lTmpDir, FTI_Ckpt[level].dir) == -1) FTI_Print("Cannot rename local directory", FTI_EROR); else FTI_Print("Local directory renamed", FTI_DBUG); } if (!globalFlag) { - if (FTI_Exec.ckptLvel == 4) { - if (rename(FTI_Conf.gTmpDir, FTI_Ckpt[FTI_Exec.ckptLvel].dir) == -1) + if (FTI_Exec->ckptLvel == 4) { + if (rename(FTI_Conf->gTmpDir, FTI_Ckpt[FTI_Exec->ckptLvel].dir) == -1) FTI_Print("Cannot rename global directory", FTI_EROR); } - if (rename(FTI_Conf.mTmpDir, FTI_Ckpt[FTI_Exec.ckptLvel].metaDir) == -1) + if (rename(FTI_Conf->mTmpDir, FTI_Ckpt[FTI_Exec->ckptLvel].metaDir) == -1) FTI_Print("Cannot rename meta directory", FTI_EROR); } @@ -250,7 +260,8 @@ int FTI_PostCkpt(int group, int fo, int pr) **/ /*-------------------------------------------------------------------------*/ -int FTI_Listen() +int FTI_Listen(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt) { MPI_Status status; char str[FTI_BUFS]; @@ -259,32 +270,32 @@ int FTI_Listen() flags[i] = 0; } FTI_Print("Head listening...", FTI_DBUG); - for (i = 0; i < FTI_Topo.nbApprocs; i++) { // Iterate on the application processes in the node - MPI_Recv(&buf, 1, MPI_INT, FTI_Topo.body[i], FTI_Conf.tag, FTI_Exec.globalComm, &status); + for (i = 0; i < FTI_Topo->nbApprocs; i++) { // Iterate on the application processes in the node + MPI_Recv(&buf, 1, MPI_INT, FTI_Topo->body[i], FTI_Conf->tag, FTI_Exec->globalComm, &status); sprintf(str, "The head received a %d message", buf); FTI_Print(str, FTI_DBUG); fflush(stdout); flags[buf - FTI_BASE] = flags[buf - FTI_BASE] + 1; } for (i = 1; i < 7; i++) { - if (flags[i] == FTI_Topo.nbApprocs) { // Determining checkpoint level - FTI_Exec.ckptLvel = i; + if (flags[i] == FTI_Topo->nbApprocs) { // Determining checkpoint level + FTI_Exec->ckptLvel = i; } } if (flags[6] > 0) { - FTI_Exec.ckptLvel = 6; + FTI_Exec->ckptLvel = 6; } - if (FTI_Exec.ckptLvel == 5) { // If we were asked to 
finalize + if (FTI_Exec->ckptLvel == 5) { // If we were asked to finalize return FTI_ENDW; } - res = FTI_Try(FTI_PostCkpt(1, 0, FTI_Topo.nbApprocs), "postprocess the checkpoint."); + res = FTI_Try(FTI_PostCkpt(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, 1, 0, FTI_Topo->nbApprocs), "postprocess the checkpoint."); if (res == FTI_SCES) { - FTI_Exec.wasLastOffline = 1; - FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; - res = FTI_Exec.ckptLvel; + FTI_Exec->wasLastOffline = 1; + FTI_Exec->lastCkptLvel = FTI_Exec->ckptLvel; + res = FTI_Exec->ckptLvel; } - for (i = 0; i < FTI_Topo.nbApprocs; i++) { // Send msg. to avoid checkpoint collision - MPI_Send(&res, 1, MPI_INT, FTI_Topo.body[i], FTI_Conf.tag, FTI_Exec.globalComm); + for (i = 0; i < FTI_Topo->nbApprocs; i++) { // Send msg. to avoid checkpoint collision + MPI_Send(&res, 1, MPI_INT, FTI_Topo->body[i], FTI_Conf->tag, FTI_Exec->globalComm); } return FTI_SCES; } diff --git a/src/conf.c b/src/conf.c index a49677237..7ddb9355e 100644 --- a/src/conf.c +++ b/src/conf.c @@ -22,14 +22,14 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_UpdateConf(int restart) +int FTI_UpdateConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, int restart) { char str[FTI_BUFS]; dictionary* ini; // Load dictionary - ini = iniparser_load(FTI_Conf.cfgFile); - sprintf(str, "Updating configuration file (%s)...", FTI_Conf.cfgFile); + ini = iniparser_load(FTI_Conf->cfgFile); + sprintf(str, "Updating configuration file (%s)...", FTI_Conf->cfgFile); FTI_Print(str, FTI_DBUG); if (ini == NULL) { FTI_Print("Iniparser failed to parse the conf. file.", FTI_WARN); @@ -41,9 +41,9 @@ int FTI_UpdateConf(int restart) // Set failure to 'restart' iniparser_set(ini, "Restart:failure", str); // Set the exec. 
ID - iniparser_set(ini, "Restart:exec_id", FTI_Exec.id); + iniparser_set(ini, "Restart:exec_id", FTI_Exec->id); - FILE* fd = fopen(FTI_Conf.cfgFile, "w"); + FILE* fd = fopen(FTI_Conf->cfgFile, "w"); if (fd == NULL) { FTI_Print("FTI failed to open the configuration file.", FTI_EROR); @@ -86,18 +86,20 @@ int FTI_UpdateConf(int restart) **/ /*-------------------------------------------------------------------------*/ -int FTI_ReadConf(FTIT_injection* FTI_Inje) +int FTI_ReadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + FTIT_injection* FTI_Inje) { // Check access to FTI configuration file and load dictionary dictionary* ini; char *par, str[FTI_BUFS]; - sprintf(str, "Reading FTI configuration file (%s)...", FTI_Conf.cfgFile); + sprintf(str, "Reading FTI configuration file (%s)...", FTI_Conf->cfgFile); FTI_Print(str, FTI_INFO); - if (access(FTI_Conf.cfgFile, F_OK) != 0) { + if (access(FTI_Conf->cfgFile, F_OK) != 0) { FTI_Print("FTI configuration file NOT accessible.", FTI_WARN); return FTI_NSCS; } - ini = iniparser_load(FTI_Conf.cfgFile); + ini = iniparser_load(FTI_Conf->cfgFile); if (ini == NULL) { FTI_Print("Iniparser failed to parse the conf. 
file.", FTI_WARN); return FTI_NSCS; @@ -105,11 +107,11 @@ int FTI_ReadConf(FTIT_injection* FTI_Inje) // Setting/reading checkpoint configuration metadata par = iniparser_getstring(ini, "Basic:ckpt_dir", NULL); - snprintf(FTI_Conf.localDir, FTI_BUFS, "%s", par); + snprintf(FTI_Conf->localDir, FTI_BUFS, "%s", par); par = iniparser_getstring(ini, "Basic:glbl_dir", NULL); - snprintf(FTI_Conf.glbalDir, FTI_BUFS, "%s", par); + snprintf(FTI_Conf->glbalDir, FTI_BUFS, "%s", par); par = iniparser_getstring(ini, "Basic:meta_dir", NULL); - snprintf(FTI_Conf.metadDir, FTI_BUFS, "%s", par); + snprintf(FTI_Conf->metadDir, FTI_BUFS, "%s", par); FTI_Ckpt[1].ckptIntv = (int)iniparser_getint(ini, "Basic:ckpt_l1", -1); FTI_Ckpt[2].ckptIntv = (int)iniparser_getint(ini, "Basic:ckpt_l2", -1); FTI_Ckpt[3].ckptIntv = (int)iniparser_getint(ini, "Basic:ckpt_l3", -1); @@ -120,52 +122,52 @@ int FTI_ReadConf(FTIT_injection* FTI_Inje) FTI_Ckpt[4].isInline = (int)iniparser_getint(ini, "Basic:inline_l4", 1); // Reading/setting configuration metadata - FTI_Conf.verbosity = (int)iniparser_getint(ini, "Basic:verbosity", -1); - FTI_Conf.saveLastCkpt = (int)iniparser_getint(ini, "Basic:keep_last_ckpt", 0); - FTI_Conf.blockSize = (int)iniparser_getint(ini, "Advanced:block_size", -1) * 1024; - FTI_Conf.tag = (int)iniparser_getint(ini, "Advanced:mpi_tag", -1); - FTI_Conf.test = (int)iniparser_getint(ini, "Advanced:local_test", -1); - FTI_Conf.l3WordSize = FTI_WORD; + FTI_Conf->verbosity = (int)iniparser_getint(ini, "Basic:verbosity", -1); + FTI_Conf->saveLastCkpt = (int)iniparser_getint(ini, "Basic:keep_last_ckpt", 0); + FTI_Conf->blockSize = (int)iniparser_getint(ini, "Advanced:block_size", -1) * 1024; + FTI_Conf->tag = (int)iniparser_getint(ini, "Advanced:mpi_tag", -1); + FTI_Conf->test = (int)iniparser_getint(ini, "Advanced:local_test", -1); + FTI_Conf->l3WordSize = FTI_WORD; // Reading/setting execution metadata - FTI_Exec.nbVar = 0; - FTI_Exec.nbType = 0; - FTI_Exec.ckpt = 0; - FTI_Exec.ckptCnt = 0; 
- FTI_Exec.ckptIcnt = 0; - FTI_Exec.ckptID = 0; - FTI_Exec.ckptLvel = 0; - FTI_Exec.ckptIntv = 1; - FTI_Exec.wasLastOffline = 0; - FTI_Exec.ckptNext = 0; - FTI_Exec.ckptLast = 0; - FTI_Exec.syncIter = 1; - FTI_Exec.lastIterTime = 0; - FTI_Exec.totalIterTime = 0; - FTI_Exec.meanIterTime = 0; - FTI_Exec.reco = (int)iniparser_getint(ini, "restart:failure", 0); - if (FTI_Exec.reco == 0) { + FTI_Exec->nbVar = 0; + FTI_Exec->nbType = 0; + FTI_Exec->ckpt = 0; + FTI_Exec->ckptCnt = 0; + FTI_Exec->ckptIcnt = 0; + FTI_Exec->ckptID = 0; + FTI_Exec->ckptLvel = 0; + FTI_Exec->ckptIntv = 1; + FTI_Exec->wasLastOffline = 0; + FTI_Exec->ckptNext = 0; + FTI_Exec->ckptLast = 0; + FTI_Exec->syncIter = 1; + FTI_Exec->lastIterTime = 0; + FTI_Exec->totalIterTime = 0; + FTI_Exec->meanIterTime = 0; + FTI_Exec->reco = (int)iniparser_getint(ini, "restart:failure", 0); + if (FTI_Exec->reco == 0) { time_t tim = time(NULL); struct tm* n = localtime(&tim); - snprintf(FTI_Exec.id, FTI_BUFS, "%d-%02d-%02d_%02d-%02d-%02d", + snprintf(FTI_Exec->id, FTI_BUFS, "%d-%02d-%02d_%02d-%02d-%02d", n->tm_year + 1900, n->tm_mon + 1, n->tm_mday, n->tm_hour, n->tm_min, n->tm_sec); - MPI_Bcast(FTI_Exec.id, FTI_BUFS, MPI_CHAR, 0, FTI_Exec.globalComm); - sprintf(str, "The execution ID is: %s", FTI_Exec.id); + MPI_Bcast(FTI_Exec->id, FTI_BUFS, MPI_CHAR, 0, FTI_Exec->globalComm); + sprintf(str, "The execution ID is: %s", FTI_Exec->id); FTI_Print(str, FTI_INFO); } else { par = iniparser_getstring(ini, "restart:exec_id", NULL); - snprintf(FTI_Exec.id, FTI_BUFS, "%s", par); - sprintf(str, "This is a restart. The execution ID is: %s", FTI_Exec.id); + snprintf(FTI_Exec->id, FTI_BUFS, "%s", par); + sprintf(str, "This is a restart. 
The execution ID is: %s", FTI_Exec->id); FTI_Print(str, FTI_INFO); } // Reading/setting topology metadata - FTI_Topo.nbHeads = (int)iniparser_getint(ini, "Basic:head", 0); - FTI_Topo.groupSize = (int)iniparser_getint(ini, "Basic:group_size", -1); - FTI_Topo.nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); - FTI_Topo.nbApprocs = FTI_Topo.nodeSize - FTI_Topo.nbHeads; - FTI_Topo.nbNodes = (FTI_Topo.nodeSize) ? FTI_Topo.nbProc / FTI_Topo.nodeSize : 0; + FTI_Topo->nbHeads = (int)iniparser_getint(ini, "Basic:head", 0); + FTI_Topo->groupSize = (int)iniparser_getint(ini, "Basic:group_size", -1); + FTI_Topo->nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); + FTI_Topo->nbApprocs = FTI_Topo->nodeSize - FTI_Topo->nbHeads; + FTI_Topo->nbNodes = (FTI_Topo->nodeSize) ? FTI_Topo->nbProc / FTI_Topo->nodeSize : 0; // Reading/setting injection parameters FTI_Inje->rank = (int)iniparser_getint(ini, "Injection:rank", 0); @@ -175,7 +177,7 @@ int FTI_ReadConf(FTIT_injection* FTI_Inje) FTI_Inje->frequency = (int)iniparser_getint(ini, "Injection:frequency", -1); // Synchronize after config reading and free dictionary - MPI_Barrier(FTI_Exec.globalComm); + MPI_Barrier(FTI_Exec->globalComm); iniparser_freedict(ini); return FTI_SCES; } @@ -190,41 +192,42 @@ int FTI_ReadConf(FTIT_injection* FTI_Inje) **/ /*-------------------------------------------------------------------------*/ -int FTI_TestConfig() +int FTI_TestConfig(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + FTIT_checkpoint* FTI_Ckpt) { - if (FTI_Topo.nbHeads != 0 && FTI_Topo.nbHeads != 1) { + if (FTI_Topo->nbHeads != 0 && FTI_Topo->nbHeads != 1) { FTI_Print("The number of heads needs to be set to 0 or 1.", FTI_WARN); return FTI_NSCS; } - if (FTI_Topo.nbProc % FTI_Topo.nodeSize != 0) { + if (FTI_Topo->nbProc % FTI_Topo->nodeSize != 0) { FTI_Print("Number of ranks is not a multiple of the node size.", FTI_WARN); return FTI_NSCS; } - if (FTI_Topo.nbNodes % FTI_Topo.groupSize != 0) { + if 
(FTI_Topo->nbNodes % FTI_Topo->groupSize != 0) { FTI_Print("The number of nodes is not multiple of the group size.", FTI_WARN); return FTI_NSCS; } - if (FTI_Topo.groupSize <= 2) { + if (FTI_Topo->groupSize <= 2) { FTI_Print("The group size must be bigger than 2", FTI_WARN); return FTI_NSCS; } - if (FTI_Topo.groupSize >= 32) { + if (FTI_Topo->groupSize >= 32) { FTI_Print("The group size must be lower than 32", FTI_WARN); return FTI_NSCS; } - if (FTI_Conf.verbosity > 3 || FTI_Conf.verbosity < 1) { + if (FTI_Conf->verbosity > 3 || FTI_Conf->verbosity < 1) { FTI_Print("Verbosity needs to be set to 1, 2 or 3.", FTI_WARN); return FTI_NSCS; } - if (FTI_Conf.blockSize > (2048 * 1024) || FTI_Conf.blockSize < (1 * 1024)) { + if (FTI_Conf->blockSize > (2048 * 1024) || FTI_Conf->blockSize < (1 * 1024)) { FTI_Print("Block size needs to be set between 1 and 2048.", FTI_WARN); return FTI_NSCS; } - if (FTI_Conf.test != 0 && FTI_Conf.test != 1) { + if (FTI_Conf->test != 0 && FTI_Conf->test != 1) { FTI_Print("Local test size needs to be set to 0 or 1.", FTI_WARN); return FTI_NSCS; } - if (FTI_Conf.saveLastCkpt != 0 && FTI_Conf.saveLastCkpt != 1) { + if (FTI_Conf->saveLastCkpt != 0 && FTI_Conf->saveLastCkpt != 1) { FTI_Print("Keep last ckpt. 
needs to be set to 0 or 1.", FTI_WARN); return FTI_NSCS; } @@ -234,7 +237,7 @@ int FTI_TestConfig() FTI_Ckpt[l].ckptIntv = -1; if (FTI_Ckpt[l].isInline != 0 && FTI_Ckpt[l].isInline != 1) FTI_Ckpt[l].isInline = 1; - if (FTI_Ckpt[l].isInline == 0 && FTI_Topo.nbHeads != 1) { + if (FTI_Ckpt[l].isInline == 0 && FTI_Topo->nbHeads != 1) { FTI_Print("If inline is set to 0 then head should be set to 1.", FTI_WARN); return FTI_NSCS; } @@ -252,25 +255,25 @@ int FTI_TestConfig() **/ /*-------------------------------------------------------------------------*/ -int FTI_TestDirectories() +int FTI_TestDirectories(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo) { char str[FTI_BUFS]; // Checking local directory - sprintf(str, "Checking the local directory (%s)...", FTI_Conf.localDir); + sprintf(str, "Checking the local directory (%s)...", FTI_Conf->localDir); FTI_Print(str, FTI_DBUG); - if (mkdir(FTI_Conf.localDir, 0777) != 0) { + if (mkdir(FTI_Conf->localDir, 0777) == -1) { if (errno != EEXIST) { FTI_Print("The local directory could NOT be created.", FTI_WARN); return FTI_NSCS; } } - if (FTI_Topo.myRank == 0) { + if (FTI_Topo->myRank == 0) { // Checking metadata directory - sprintf(str, "Checking the metadata directory (%s)...", FTI_Conf.metadDir); + sprintf(str, "Checking the metadata directory (%s)...", FTI_Conf->metadDir); FTI_Print(str, FTI_DBUG); - if (mkdir(FTI_Conf.metadDir, 0777) != 0) { + if (mkdir(FTI_Conf->metadDir, 0777) == -1) { if (errno != EEXIST) { FTI_Print("The metadata directory could NOT be created.", FTI_WARN); return FTI_NSCS; @@ -278,9 +281,9 @@ int FTI_TestDirectories() } // Checking global directory - sprintf(str, "Checking the global directory (%s)...", FTI_Conf.glbalDir); + sprintf(str, "Checking the global directory (%s)...", FTI_Conf->glbalDir); FTI_Print(str, FTI_DBUG); - if (mkdir(FTI_Conf.glbalDir, 0777) != 0) { + if (mkdir(FTI_Conf->glbalDir, 0777) == -1) { if (errno != EEXIST) { FTI_Print("The global directory could NOT be created.", 
FTI_WARN); return FTI_NSCS; @@ -301,53 +304,54 @@ int FTI_TestDirectories() **/ /*-------------------------------------------------------------------------*/ -int FTI_CreateDirs() +int FTI_CreateDirs(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt) { char fn[FTI_BUFS]; // Create metadata timestamp directory - snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf.metadDir, FTI_Exec.id); + snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf->metadDir, FTI_Exec->id); if (mkdir(fn, 0777) == -1) { if (errno != EEXIST) FTI_Print("Cannot create metadata timestamp directory", FTI_EROR); } - snprintf(FTI_Conf.metadDir, FTI_BUFS, "%s", fn); - snprintf(FTI_Conf.mTmpDir, FTI_BUFS, "%s/tmp", fn); + snprintf(FTI_Conf->metadDir, FTI_BUFS, "%s", fn); + snprintf(FTI_Conf->mTmpDir, FTI_BUFS, "%s/tmp", fn); snprintf(FTI_Ckpt[1].metaDir, FTI_BUFS, "%s/l1", fn); snprintf(FTI_Ckpt[2].metaDir, FTI_BUFS, "%s/l2", fn); snprintf(FTI_Ckpt[3].metaDir, FTI_BUFS, "%s/l3", fn); snprintf(FTI_Ckpt[4].metaDir, FTI_BUFS, "%s/l4", fn); // Create global checkpoint timestamp directory - snprintf(fn, FTI_BUFS, "%s", FTI_Conf.glbalDir); - snprintf(FTI_Conf.glbalDir, FTI_BUFS, "%s/%s", fn, FTI_Exec.id); - if (mkdir(FTI_Conf.glbalDir, 0777) == -1) { + snprintf(fn, FTI_BUFS, "%s", FTI_Conf->glbalDir); + snprintf(FTI_Conf->glbalDir, FTI_BUFS, "%s/%s", fn, FTI_Exec->id); + if (mkdir(FTI_Conf->glbalDir, 0777) == -1) { if (errno != EEXIST) FTI_Print("Cannot create global checkpoint timestamp directory", FTI_EROR); } - snprintf(FTI_Conf.gTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf.glbalDir); - snprintf(FTI_Ckpt[4].dir, FTI_BUFS, "%s/l4", FTI_Conf.glbalDir); + snprintf(FTI_Conf->gTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf->glbalDir); + snprintf(FTI_Ckpt[4].dir, FTI_BUFS, "%s/l4", FTI_Conf->glbalDir); // Create local checkpoint timestamp directory - if (FTI_Conf.test) { // If local test generate name by topology - snprintf(fn, FTI_BUFS, "%s/node%d", FTI_Conf.localDir, FTI_Topo.myRank / 
FTI_Topo.nodeSize); + if (FTI_Conf->test) { // If local test generate name by topology + snprintf(fn, FTI_BUFS, "%s/node%d", FTI_Conf->localDir, FTI_Topo->myRank / FTI_Topo->nodeSize); if (mkdir(fn, 0777) == -1) { if (errno != EEXIST) FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); } } else { - snprintf(fn, FTI_BUFS, "%s", FTI_Conf.localDir); + snprintf(fn, FTI_BUFS, "%s", FTI_Conf->localDir); } - snprintf(FTI_Conf.localDir, FTI_BUFS, "%s/%s", fn, FTI_Exec.id); - if (mkdir(FTI_Conf.localDir, 0777) == -1) { + snprintf(FTI_Conf->localDir, FTI_BUFS, "%s/%s", fn, FTI_Exec->id); + if (mkdir(FTI_Conf->localDir, 0777) == -1) { if (errno != EEXIST) FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); } - snprintf(FTI_Conf.lTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf.localDir); - snprintf(FTI_Ckpt[1].dir, FTI_BUFS, "%s/l1", FTI_Conf.localDir); - snprintf(FTI_Ckpt[2].dir, FTI_BUFS, "%s/l2", FTI_Conf.localDir); - snprintf(FTI_Ckpt[3].dir, FTI_BUFS, "%s/l3", FTI_Conf.localDir); + snprintf(FTI_Conf->lTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf->localDir); + snprintf(FTI_Ckpt[1].dir, FTI_BUFS, "%s/l1", FTI_Conf->localDir); + snprintf(FTI_Ckpt[2].dir, FTI_BUFS, "%s/l2", FTI_Conf->localDir); + snprintf(FTI_Ckpt[3].dir, FTI_BUFS, "%s/l3", FTI_Conf->localDir); return FTI_SCES; } @@ -362,25 +366,27 @@ int FTI_CreateDirs() **/ /*-------------------------------------------------------------------------*/ -int FTI_LoadConf(FTIT_injection* FTI_Inje) +int FTI_LoadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + FTIT_injection *FTI_Inje) { int res; - res = FTI_Try(FTI_ReadConf(FTI_Inje), "read configuration."); + res = FTI_Try(FTI_ReadConf(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, FTI_Inje), "read configuration."); if (res == FTI_NSCS) { FTI_Print("Impossible to read configuration.", FTI_WARN); return FTI_NSCS; } - res = FTI_Try(FTI_TestConfig(), "pass the configuration test."); + res = 
FTI_Try(FTI_TestConfig(FTI_Conf, FTI_Topo, FTI_Ckpt), "pass the configuration test."); if (res == FTI_NSCS) { FTI_Print("Wrong configuration.", FTI_WARN); return FTI_NSCS; } - res = FTI_Try(FTI_TestDirectories(), "pass the directories test."); + res = FTI_Try(FTI_TestDirectories(FTI_Conf, FTI_Topo), "pass the directories test."); if (res == FTI_NSCS) { FTI_Print("Problem with the directories.", FTI_WARN); return FTI_NSCS; } - res = FTI_Try(FTI_CreateDirs(), "create checkpoint directories."); + res = FTI_Try(FTI_CreateDirs(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt), "create checkpoint directories."); if (res == FTI_NSCS) { FTI_Print("Problem creating the directories.", FTI_WARN); return FTI_NSCS; diff --git a/src/interface.h b/src/interface.h index f19de8396..403553081 100644 --- a/src/interface.h +++ b/src/interface.h @@ -38,50 +38,86 @@ void FTI_Abort(); int FTI_FloatBitFlip(float *target, int bit); int FTI_DoubleBitFlip(double *target, int bit); +void FTI_Print(char *msg, int priority); -int FTI_UpdateIterTime(); -int FTI_WriteCkpt(FTIT_dataset* FTI_Data); -int FTI_GroupClean(int level, int group, int pr); -int FTI_PostCkpt(int group, int fo, int pr); -int FTI_Listen(); - -int FTI_UpdateConf(int restart); -int FTI_ReadConf(FTIT_injection *FTI_Inje); -int FTI_TestConfig(); -int FTI_TestDirectories(); -int FTI_CreateDirs(); -int FTI_LoadConf(FTIT_injection *FTI_Inje); - -int FTI_GetMeta(unsigned long *fs, unsigned long *mfs, int group, int level); -int FTI_WriteMetadata(unsigned long *fs, unsigned long mfs, char* fnl); -int FTI_CreateMetadata(int globalTmp); - -int FTI_Local(int group); -int FTI_Ptner(int group); -int FTI_RSenc(int group); -int FTI_Flush(int group, int level); - -int FTI_Decode(int fs, int maxFs, int *erased); -int FTI_RecoverL1(int group); -int FTI_RecoverL2(int group); -int FTI_RecoverL3(int group); -int FTI_RecoverL4(int group); +int FTI_UpdateIterTime(FTIT_execution* FTI_Exec); +int FTI_WriteCkpt(FTIT_configuration* FTI_Conf, FTIT_execution* 
FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + FTIT_dataset* FTI_Data); +int FTI_GroupClean(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + FTIT_checkpoint* FTI_Ckpt, int level, int group, int pr); +int FTI_PostCkpt(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + int group, int fo, int pr); +int FTI_Listen(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt); + +int FTI_UpdateConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + int restart); +int FTI_ReadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + FTIT_injection *FTI_Inje); +int FTI_TestConfig(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + FTIT_checkpoint* FTI_Ckpt); +int FTI_TestDirectories(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo); +int FTI_CreateDirs(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt); +int FTI_LoadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + FTIT_injection *FTI_Inje); + +int FTI_GetMeta(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + unsigned long *fs, unsigned long *mfs, int group, int level); +int FTI_WriteMetadata(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + unsigned long *fs, unsigned long mfs, char* fnl); +int FTI_CreateMetadata(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, int globalTmp); + +int FTI_Local(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group); +int FTI_Ptner(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group); +int FTI_RSenc(FTIT_configuration* 
FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group); +int FTI_Flush(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group, int level); + +int FTI_Decode(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + int fs, int maxFs, int *erased); +int FTI_RecoverL1(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group); +int FTI_RecoverL2(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group); +int FTI_RecoverL3(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group); +int FTI_RecoverL4(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group); int FTI_CheckFile(char *fn, unsigned long fs); -int FTI_CheckErasures(unsigned long *fs, unsigned long *maxFs, int group, int *erased, int level); -int FTI_RecoverFiles(); +int FTI_CheckErasures(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + unsigned long *fs, unsigned long *maxFs, int group, + int *erased, int level); +int FTI_RecoverFiles(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt); -int FTI_Clean(int level, int group, int rank); -void FTI_Print(char *msg, int priority); int FTI_Try(int result, char* message); -int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]); +int FTI_InitBasicTypes(FTIT_dataset* FTI_Data); int FTI_RmDir(char path[FTI_BUFS], int flag); -int FTI_Clean(int level, int group, int rank); - -int FTI_SaveTopo(char *nameList); -int FTI_ReorderNodes(int *nodeList, char *nameList); -int FTI_BuildNodeList(int *nodeList, char *nameList); -int FTI_CreateComms(int 
*userProcList, int *distProcList, int* nodeList); -int FTI_Topology(); +int FTI_Clean(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + FTIT_checkpoint* FTI_Ckpt, int level, int group, int rank); + +int FTI_SaveTopo(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, char *nameList); +int FTI_ReorderNodes(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + int *nodeList, char *nameList); +int FTI_BuildNodeList(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, int *nodeList, char *nameList); +int FTI_CreateComms(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, int *userProcList, + int *distProcList, int* nodeList); +int FTI_Topology(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo); #endif diff --git a/src/meta.c b/src/meta.c index daae505c2..d0da19e17 100644 --- a/src/meta.c +++ b/src/meta.c @@ -22,16 +22,18 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_GetMeta(unsigned long* fs, unsigned long* mfs, int group, int level) +int FTI_GetMeta(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + unsigned long* fs, unsigned long* mfs, int group, int level) { dictionary* ini; int res = -1, cnt = 3; char mfn[FTI_BUFS], str[FTI_BUFS], *cfn; if (level == 0) { - sprintf(mfn, "%s/sector%d-group%d.fti", FTI_Conf.mTmpDir, FTI_Topo.sectorID, group); + sprintf(mfn, "%s/sector%d-group%d.fti", FTI_Conf->mTmpDir, FTI_Topo->sectorID, group); } else { - sprintf(mfn, "%s/sector%d-group%d.fti", FTI_Ckpt[level].metaDir, FTI_Topo.sectorID, group); + sprintf(mfn, "%s/sector%d-group%d.fti", FTI_Ckpt[level].metaDir, FTI_Topo->sectorID, group); } sprintf(str, "Getting FTI metadata file (%s)...", mfn); FTI_Print(str, FTI_DBUG); @@ -49,12 +51,12 @@ int FTI_GetMeta(unsigned long* fs, unsigned long* mfs, int group, int level) FTI_Print("Iniparser failed to parse the metadata 
file.", FTI_WARN); return FTI_NSCS; } - sprintf(str, "%d:Ckpt_file_name", FTI_Topo.groupRank); + sprintf(str, "%d:Ckpt_file_name", FTI_Topo->groupRank); cfn = iniparser_getstring(ini, str, NULL); - snprintf(FTI_Exec.ckptFile, FTI_BUFS, "%s", cfn); - sprintf(str, "%d:Ckpt_file_size", FTI_Topo.groupRank); + snprintf(FTI_Exec->ckptFile, FTI_BUFS, "%s", cfn); + sprintf(str, "%d:Ckpt_file_size", FTI_Topo->groupRank); *fs = (int)iniparser_getint(ini, str, -1); - sprintf(str, "%d:Ckpt_file_maxs", FTI_Topo.groupRank); + sprintf(str, "%d:Ckpt_file_maxs", FTI_Topo->groupRank); *mfs = (int)iniparser_getint(ini, str, -1); iniparser_freedict(ini); return FTI_SCES; @@ -73,13 +75,14 @@ int FTI_GetMeta(unsigned long* fs, unsigned long* mfs, int group, int level) **/ /*-------------------------------------------------------------------------*/ -int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) +int FTI_WriteMetadata(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + unsigned long* fs, unsigned long mfs, char* fnl) { char str[FTI_BUFS], buf[FTI_BUFS]; dictionary* ini; int i; - snprintf(buf, FTI_BUFS, "%s/Topology.fti", FTI_Conf.metadDir); + snprintf(buf, FTI_BUFS, "%s/Topology.fti", FTI_Conf->metadDir); sprintf(str, "Temporary load of topology file (%s)...", buf); FTI_Print(str, FTI_DBUG); @@ -92,7 +95,7 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) } // Add metadata to dictionary - for (i = 0; i < FTI_Topo.groupSize; i++) { + for (i = 0; i < FTI_Topo->groupSize; i++) { strncpy(buf, fnl + (i * FTI_BUFS), FTI_BUFS - 1); sprintf(str, "%d", i); iniparser_set(ini, str, NULL); @@ -108,14 +111,15 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) // Remove topology section iniparser_unset(ini, "topology"); - if (mkdir(FTI_Conf.mTmpDir, 0777) == -1) { + if (mkdir(FTI_Conf->mTmpDir, 0777) == -1) { if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); } - sprintf(buf, "%s/sector%d-group%d.fti", 
FTI_Conf.mTmpDir, FTI_Topo.sectorID, FTI_Topo.groupID); + sprintf(buf, "%s/sector%d-group%d.fti", FTI_Conf->mTmpDir, FTI_Topo->sectorID, FTI_Topo->groupID); if (remove(buf) == -1) - FTI_Print("Cannot remove sector-group.fti", FTI_EROR); + if (errno != ENOENT) + FTI_Print("Cannot remove sector-group.fti", FTI_EROR); sprintf(str, "Creating metadata file (%s)...", buf); FTI_Print(str, FTI_DBUG); @@ -164,44 +168,45 @@ int FTI_WriteMetadata(unsigned long* fs, unsigned long mfs, char* fnl) **/ /*-------------------------------------------------------------------------*/ -int FTI_CreateMetadata(int globalTmp) +int FTI_CreateMetadata(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, int globalTmp) { - char* fnl = talloc(char, FTI_Topo.groupSize* FTI_BUFS); + char* fnl = talloc(char, FTI_Topo->groupSize* FTI_BUFS); unsigned long fs[FTI_BUFS], mfs, tmpo; char str[FTI_BUFS], buf[FTI_BUFS]; struct stat fileStatus; int i; if (globalTmp) { - sprintf(buf, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); + sprintf(buf, "%s/%s", FTI_Conf->gTmpDir, FTI_Exec->ckptFile); } else { - sprintf(buf, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); + sprintf(buf, "%s/%s", FTI_Conf->lTmpDir, FTI_Exec->ckptFile); } if (stat(buf, &fileStatus) == 0) { // Getting size of files - fs[FTI_Topo.groupRank] = (unsigned long)fileStatus.st_size; + fs[FTI_Topo->groupRank] = (unsigned long)fileStatus.st_size; } else { FTI_Print("Error with stat on the checkpoint file.", FTI_WARN); free(fnl); return FTI_NSCS; } - sprintf(str, "Checkpoint file size : %ld bytes.", fs[FTI_Topo.groupRank]); + sprintf(str, "Checkpoint file size : %ld bytes.", fs[FTI_Topo->groupRank]); FTI_Print(str, FTI_DBUG); - sprintf(fnl + (FTI_Topo.groupRank * FTI_BUFS), "%s", FTI_Exec.ckptFile); - tmpo = fs[FTI_Topo.groupRank]; // Gather all the file sizes - MPI_Allgather(&tmpo, 1, MPI_UNSIGNED_LONG, fs, 1, MPI_UNSIGNED_LONG, FTI_Exec.groupComm); - strncpy(str, fnl + (FTI_Topo.groupRank * FTI_BUFS), FTI_BUFS - 
1); // Gather all the file names - MPI_Allgather(str, FTI_BUFS, MPI_CHAR, fnl, FTI_BUFS, MPI_CHAR, FTI_Exec.groupComm); + sprintf(fnl + (FTI_Topo->groupRank * FTI_BUFS), "%s", FTI_Exec->ckptFile); + tmpo = fs[FTI_Topo->groupRank]; // Gather all the file sizes + MPI_Allgather(&tmpo, 1, MPI_UNSIGNED_LONG, fs, 1, MPI_UNSIGNED_LONG, FTI_Exec->groupComm); + strncpy(str, fnl + (FTI_Topo->groupRank * FTI_BUFS), FTI_BUFS - 1); // Gather all the file names + MPI_Allgather(str, FTI_BUFS, MPI_CHAR, fnl, FTI_BUFS, MPI_CHAR, FTI_Exec->groupComm); mfs = 0; - for (i = 0; i < FTI_Topo.groupSize; i++) { + for (i = 0; i < FTI_Topo->groupSize; i++) { if (fs[i] > mfs) { mfs = fs[i]; // Search max. size } } sprintf(str, "Max. file size %ld.", mfs); FTI_Print(str, FTI_DBUG); - if (FTI_Topo.groupRank == 0) { // Only one process in the group create the metadata - int res = FTI_Try(FTI_WriteMetadata(fs, mfs, fnl), "write the metadata."); + if (FTI_Topo->groupRank == 0) { // Only one process in the group create the metadata + int res = FTI_Try(FTI_WriteMetadata(FTI_Conf, FTI_Topo, fs, mfs, fnl), "write the metadata."); if (res == FTI_NSCS) { free(fnl); return FTI_NSCS; diff --git a/src/postckpt.c b/src/postckpt.c index b61f13483..f66bedd7d 100644 --- a/src/postckpt.c +++ b/src/postckpt.c @@ -17,11 +17,12 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_Local(int group) +int FTI_Local(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group) { unsigned long maxFs, fs; FTI_Print("Starting checkpoint post-processing L1", FTI_DBUG); - int res = FTI_Try(FTI_GetMeta(&fs, &maxFs, group, 0), "obtain metadata."); + int res = FTI_Try(FTI_GetMeta(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, group, 0), "obtain metadata."); if (res == FTI_NSCS) return FTI_NSCS; return FTI_SCES; @@ -39,34 +40,35 @@ int FTI_Local(int group) **/ 
/*-------------------------------------------------------------------------*/ -int FTI_Ptner(int group) +int FTI_Ptner(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group) { char *blBuf1, *blBuf2, lfn[FTI_BUFS], pfn[FTI_BUFS], str[FTI_BUFS]; unsigned long maxFs, fs, ps, pos = 0; MPI_Request reqSend, reqRecv; FILE *lfd, *pfd; - int res, dest, src, bSize = FTI_Conf.blockSize; + int res, dest, src, bSize = FTI_Conf->blockSize; MPI_Status status; FTI_Print("Starting checkpoint post-processing L2", FTI_DBUG); - res = FTI_Try(FTI_GetMeta(&fs, &maxFs, group, 0), "obtain metadata."); + res = FTI_Try(FTI_GetMeta(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, group, 0), "obtain metadata."); if (res == FTI_NSCS) return FTI_NSCS; - ps = (maxFs / FTI_Conf.blockSize) * FTI_Conf.blockSize; + ps = (maxFs / FTI_Conf->blockSize) * FTI_Conf->blockSize; if (ps < maxFs) - ps = ps + FTI_Conf.blockSize; + ps = ps + FTI_Conf->blockSize; sprintf(str, "Max. file size %ld and padding size %ld.", maxFs, ps); FTI_Print(str, FTI_DBUG); - sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &src); - sprintf(lfn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); - sprintf(pfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Conf.lTmpDir, FTI_Exec.ckptID, src); + sscanf(FTI_Exec->ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec->ckptID, &src); + sprintf(lfn, "%s/%s", FTI_Conf->lTmpDir, FTI_Exec->ckptFile); + sprintf(pfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Conf->lTmpDir, FTI_Exec->ckptID, src); sprintf(str, "L2 trying to access local ckpt. 
file (%s).", lfn); FTI_Print(str, FTI_DBUG); - dest = FTI_Topo.right; - src = FTI_Topo.left; + dest = FTI_Topo->right; + src = FTI_Topo->left; lfd = fopen(lfn, "rb"); if (lfd == NULL) { @@ -81,11 +83,11 @@ int FTI_Ptner(int group) return FTI_NSCS; } - blBuf1 = talloc(char, FTI_Conf.blockSize); - blBuf2 = talloc(char, FTI_Conf.blockSize); + blBuf1 = talloc(char, FTI_Conf->blockSize); + blBuf2 = talloc(char, FTI_Conf->blockSize); // Checkpoint files partner copy while (pos < ps) { - if ((fs - pos) < FTI_Conf.blockSize) + if ((fs - pos) < FTI_Conf->blockSize) bSize = fs - pos; size_t bytes = fread(blBuf1, sizeof(char), bSize, lfd); @@ -100,8 +102,8 @@ int FTI_Ptner(int group) return FTI_NSCS; } - MPI_Isend(blBuf1, bytes, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); - MPI_Irecv(blBuf2, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv); + MPI_Isend(blBuf1, bytes, MPI_CHAR, dest, FTI_Conf->tag, FTI_Exec->groupComm, &reqSend); + MPI_Irecv(blBuf2, FTI_Conf->blockSize, MPI_CHAR, src, FTI_Conf->tag, FTI_Exec->groupComm, &reqRecv); MPI_Wait(&reqSend, &status); MPI_Wait(&reqRecv, &status); @@ -117,7 +119,7 @@ int FTI_Ptner(int group) return FTI_NSCS; } - pos = pos + FTI_Conf.blockSize; + pos = pos + FTI_Conf->blockSize; } free(blBuf1); @@ -140,10 +142,11 @@ int FTI_Ptner(int group) **/ /*-------------------------------------------------------------------------*/ -int FTI_RSenc(int group) +int FTI_RSenc(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group) { char *myData, *data, *coding, lfn[FTI_BUFS], efn[FTI_BUFS], str[FTI_BUFS]; - int *matrix, cnt, i, j, init, src, offset, dest, matVal, res, bs = FTI_Conf.blockSize; + int *matrix, cnt, i, j, init, src, offset, dest, matVal, res, bs = FTI_Conf->blockSize; unsigned long maxFs, fs, ps, pos = 0; MPI_Request reqSend, reqRecv; MPI_Status status; @@ -151,16 +154,16 @@ int FTI_RSenc(int group) FILE *lfd, *efd; 
FTI_Print("Starting checkpoint post-processing L3", FTI_DBUG); - res = FTI_Try(FTI_GetMeta(&fs, &maxFs, group, 0), "obtain metadata."); + res = FTI_Try(FTI_GetMeta(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, group, 0), "obtain metadata."); if (res != FTI_SCES) return FTI_NSCS; ps = ((maxFs / bs)) * bs; if (ps < maxFs) ps = ps + bs; - sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); - sprintf(lfn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); - sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Conf.lTmpDir, FTI_Exec.ckptID, i); + sscanf(FTI_Exec->ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec->ckptID, &i); + sprintf(lfn, "%s/%s", FTI_Conf->lTmpDir, FTI_Exec->ckptFile); + sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Conf->lTmpDir, FTI_Exec->ckptID, i); sprintf(str, "L3 trying to access local ckpt. file (%s).", lfn); FTI_Print(str, FTI_DBUG); @@ -181,11 +184,11 @@ int FTI_RSenc(int group) myData = talloc(char, bs); coding = talloc(char, bs); data = talloc(char, 2 * bs); - matrix = talloc(int, FTI_Topo.groupSize* FTI_Topo.groupSize); + matrix = talloc(int, FTI_Topo->groupSize* FTI_Topo->groupSize); - for (i = 0; i < FTI_Topo.groupSize; i++) { - for (j = 0; j < FTI_Topo.groupSize; j++) { - matrix[i * FTI_Topo.groupSize + j] = galois_single_divide(1, i ^ (FTI_Topo.groupSize + j), FTI_Conf.l3WordSize); + for (i = 0; i < FTI_Topo->groupSize; i++) { + for (j = 0; j < FTI_Topo->groupSize; j++) { + matrix[i * FTI_Topo->groupSize + j] = galois_single_divide(1, i ^ (FTI_Topo->groupSize + j), FTI_Conf->l3WordSize); } } @@ -210,14 +213,14 @@ int FTI_RSenc(int group) return FTI_NSCS; } - dest = FTI_Topo.groupRank; - i = FTI_Topo.groupRank; + dest = FTI_Topo->groupRank; + i = FTI_Topo->groupRank; offset = 0; init = 0; cnt = 0; // For each encoding - while (cnt < FTI_Topo.groupSize) { + while (cnt < FTI_Topo->groupSize) { if (cnt == 0) { memcpy(&(data[offset * bs]), myData, sizeof(char) * bytes); } @@ -227,14 +230,14 @@ int FTI_RSenc(int group) } // At every loop *but* 
the last one we send the data - if (cnt != FTI_Topo.groupSize - 1) { - dest = (dest + FTI_Topo.groupSize - 1) % FTI_Topo.groupSize; - src = (i + 1) % FTI_Topo.groupSize; - MPI_Isend(myData, bytes, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend); - MPI_Irecv(&(data[(1 - offset) * bs]), bs, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv); + if (cnt != FTI_Topo->groupSize - 1) { + dest = (dest + FTI_Topo->groupSize - 1) % FTI_Topo->groupSize; + src = (i + 1) % FTI_Topo->groupSize; + MPI_Isend(myData, bytes, MPI_CHAR, dest, FTI_Conf->tag, FTI_Exec->groupComm, &reqSend); + MPI_Irecv(&(data[(1 - offset) * bs]), bs, MPI_CHAR, src, FTI_Conf->tag, FTI_Exec->groupComm, &reqRecv); } - matVal = matrix[FTI_Topo.groupRank * FTI_Topo.groupSize + i]; + matVal = matrix[FTI_Topo->groupRank * FTI_Topo->groupSize + i]; // First copy or xor any data that does not need to be multiplied by a factor if (matVal == 1) { if (init == 0) { @@ -252,7 +255,7 @@ int FTI_RSenc(int group) init = 1; } - i = (i + 1) % FTI_Topo.groupSize; + i = (i + 1) % FTI_Topo->groupSize; offset = 1 - offset; cnt++; } @@ -286,7 +289,8 @@ int FTI_RSenc(int group) **/ /*-------------------------------------------------------------------------*/ -int FTI_Flush(int group, int level) +int FTI_Flush(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group, int level) { char lfn[FTI_BUFS], gfn[FTI_BUFS], str[FTI_BUFS]; unsigned long maxFs, fs, ps, pos = 0; @@ -295,35 +299,35 @@ int FTI_Flush(int group, int level) return FTI_SCES; // Fake call for inline PFS checkpoint FTI_Print("Starting checkpoint post-processing L4", FTI_DBUG); - int res = FTI_Try(FTI_GetMeta(&fs, &maxFs, group, level), "obtain metadata."); + int res = FTI_Try(FTI_GetMeta(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, group, level), "obtain metadata."); if (res != FTI_SCES) return FTI_NSCS; - if (mkdir(FTI_Conf.gTmpDir, 0777) == -1) { + if 
(mkdir(FTI_Conf->gTmpDir, 0777) == -1) { if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); } - ps = (maxFs / FTI_Conf.blockSize) * FTI_Conf.blockSize; + ps = (maxFs / FTI_Conf->blockSize) * FTI_Conf->blockSize; if (ps < maxFs) - ps = ps + FTI_Conf.blockSize; + ps = ps + FTI_Conf->blockSize; switch (level) { case 0: - sprintf(lfn, "%s/%s", FTI_Conf.lTmpDir, FTI_Exec.ckptFile); + sprintf(lfn, "%s/%s", FTI_Conf->lTmpDir, FTI_Exec->ckptFile); break; case 1: - sprintf(lfn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); + sprintf(lfn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec->ckptFile); break; case 2: - sprintf(lfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); + sprintf(lfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec->ckptFile); break; case 3: - sprintf(lfn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); + sprintf(lfn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec->ckptFile); break; } // Open and resize files - sprintf(gfn, "%s/%s", FTI_Conf.gTmpDir, FTI_Exec.ckptFile); + sprintf(gfn, "%s/%s", FTI_Conf->gTmpDir, FTI_Exec->ckptFile); sprintf(str, "L4 trying to access local ckpt. 
file (%s).", lfn); FTI_Print(str, FTI_DBUG); @@ -343,12 +347,12 @@ int FTI_Flush(int group, int level) return FTI_NSCS; } - char *blBuf1 = talloc(char, FTI_Conf.blockSize); - unsigned long bSize = FTI_Conf.blockSize; + char *blBuf1 = talloc(char, FTI_Conf->blockSize); + unsigned long bSize = FTI_Conf->blockSize; // Checkpoint files exchange while (pos < ps) { - if ((fs - pos) < FTI_Conf.blockSize) + if ((fs - pos) < FTI_Conf->blockSize) bSize = fs - pos; size_t bytes = fread(blBuf1, sizeof(char), bSize, lfd); @@ -375,7 +379,7 @@ int FTI_Flush(int group, int level) return FTI_NSCS; } - pos = pos + FTI_Conf.blockSize; + pos = pos + FTI_Conf->blockSize; } free(blBuf1); diff --git a/src/postreco.c b/src/postreco.c index c89932a51..58ea7ee32 100644 --- a/src/postreco.c +++ b/src/postreco.c @@ -17,42 +17,44 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_Decode(int fs, int maxFs, int* erased) +int FTI_Decode(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + int fs, int maxFs, int* erased) { int *matrix, *decMatrix, *dm_ids, *tmpmat, i, j, k, m, ps, bs, pos = 0; char **coding, **data, *dataTmp, fn[FTI_BUFS], efn[FTI_BUFS], str[FTI_BUFS]; FILE *fd, *efd; - bs = FTI_Conf.blockSize; - k = FTI_Topo.groupSize; + bs = FTI_Conf->blockSize; + k = FTI_Topo->groupSize; m = k; - ps = ((maxFs / FTI_Conf.blockSize)) * FTI_Conf.blockSize; + ps = ((maxFs / FTI_Conf->blockSize)) * FTI_Conf->blockSize; if (ps < maxFs) - ps = ps + FTI_Conf.blockSize; // Calculating padding size + ps = ps + FTI_Conf->blockSize; // Calculating padding size if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); - sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &i); - sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); - sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, i); + sscanf(FTI_Exec->ckptFile, 
"Ckpt%d-Rank%d.fti", &FTI_Exec->ckptID, &i); + sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec->ckptFile); + sprintf(efn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec->ckptID, i); data = talloc(char*, k); coding = talloc(char*, m); - dataTmp = talloc(char, FTI_Conf.blockSize* k); + dataTmp = talloc(char, FTI_Conf->blockSize* k); dm_ids = talloc(int, k); decMatrix = talloc(int, k* k); tmpmat = talloc(int, k* k); matrix = talloc(int, k* k); - for (i = 0; i < FTI_Topo.groupSize; i++) { - for (j = 0; j < FTI_Topo.groupSize; j++) { - matrix[i * FTI_Topo.groupSize + j] = galois_single_divide(1, i ^ (FTI_Topo.groupSize + j), FTI_Conf.l3WordSize); + for (i = 0; i < FTI_Topo->groupSize; i++) { + for (j = 0; j < FTI_Topo->groupSize; j++) { + matrix[i * FTI_Topo->groupSize + j] = galois_single_divide(1, i ^ (FTI_Topo->groupSize + j), FTI_Conf->l3WordSize); } } for (i = 0; i < m; i++) { - coding[i] = talloc(char, FTI_Conf.blockSize); - data[i] = talloc(char, FTI_Conf.blockSize); + coding[i] = talloc(char, FTI_Conf->blockSize); + data[i] = talloc(char, FTI_Conf->blockSize); } j = 0; for (i = 0; j < k; i++) { @@ -74,7 +76,7 @@ int FTI_Decode(int fs, int maxFs, int* erased) } } // Inversing the matrix - if (jerasure_invert_matrix(tmpmat, decMatrix, k, FTI_Conf.l3WordSize) < 0) { + if (jerasure_invert_matrix(tmpmat, decMatrix, k, FTI_Conf->l3WordSize) < 0) { FTI_Print("Error inversing matrix", FTI_DBUG); for (i = 0; i < m; i++) { @@ -91,7 +93,7 @@ int FTI_Decode(int fs, int maxFs, int* erased) return FTI_NSCS; } - if (erased[FTI_Topo.groupRank] == 0) { // Resize and open files + if (erased[FTI_Topo->groupRank] == 0) { // Resize and open files if (truncate(fn, ps) == -1) { FTI_Print("Error with truncate on checkpoint file", FTI_DBUG); @@ -158,9 +160,9 @@ int FTI_Decode(int fs, int maxFs, int* erased) // Main loop, block by block while (pos < ps) { // Reading the data - if (erased[FTI_Topo.groupRank] == 0) { - size_t data_size = fread(data[FTI_Topo.groupRank] + 0, sizeof(char), 
bs, fd); - size_t coding_size = fread(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); + if (erased[FTI_Topo->groupRank] == 0) { + size_t data_size = fread(data[FTI_Topo->groupRank] + 0, sizeof(char), bs, fd); + size_t coding_size = fread(coding[FTI_Topo->groupRank] + 0, sizeof(char), bs, efd); if (ferror(fd) || ferror(efd)) { FTI_Print("R3 cannot from the ckpt. file or the encoded ckpt. file.", FTI_DBUG); @@ -184,33 +186,33 @@ int FTI_Decode(int fs, int maxFs, int* erased) } } else { - bzero(data[FTI_Topo.groupRank], bs); - bzero(coding[FTI_Topo.groupRank], bs); + bzero(data[FTI_Topo->groupRank], bs); + bzero(coding[FTI_Topo->groupRank], bs); } // Erasure found - MPI_Allgather(data[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); + MPI_Allgather(data[FTI_Topo->groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec->groupComm); for (i = 0; i < k; i++) memcpy(data[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); - MPI_Allgather(coding[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); + MPI_Allgather(coding[FTI_Topo->groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec->groupComm); for (i = 0; i < k; i++) memcpy(coding[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); // Decoding the lost data work - if (erased[FTI_Topo.groupRank]) - jerasure_matrix_dotprod(k, FTI_Conf.l3WordSize, decMatrix + (FTI_Topo.groupRank * k), dm_ids, FTI_Topo.groupRank, data, coding, bs); + if (erased[FTI_Topo->groupRank]) + jerasure_matrix_dotprod(k, FTI_Conf->l3WordSize, decMatrix + (FTI_Topo->groupRank * k), dm_ids, FTI_Topo->groupRank, data, coding, bs); - MPI_Allgather(data[FTI_Topo.groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec.groupComm); + MPI_Allgather(data[FTI_Topo->groupRank] + 0, bs, MPI_CHAR, dataTmp, bs, MPI_CHAR, FTI_Exec->groupComm); for (i = 0; i < k; i++) memcpy(data[i] + 0, &(dataTmp[i * bs]), sizeof(char) * bs); // Finally, re-encode any erased encoded checkpoint file - 
if (erased[FTI_Topo.groupRank + k]) - jerasure_matrix_dotprod(k, FTI_Conf.l3WordSize, matrix + (FTI_Topo.groupRank * k), NULL, FTI_Topo.groupRank + k, data, coding, bs); - if (erased[FTI_Topo.groupRank]) - fwrite(data[FTI_Topo.groupRank] + 0, sizeof(char), bs, fd); - if (erased[FTI_Topo.groupRank + k]) - fwrite(coding[FTI_Topo.groupRank] + 0, sizeof(char), bs, efd); + if (erased[FTI_Topo->groupRank + k]) + jerasure_matrix_dotprod(k, FTI_Conf->l3WordSize, matrix + (FTI_Topo->groupRank * k), NULL, FTI_Topo->groupRank + k, data, coding, bs); + if (erased[FTI_Topo->groupRank]) + fwrite(data[FTI_Topo->groupRank] + 0, sizeof(char), bs, fd); + if (erased[FTI_Topo->groupRank + k]) + fwrite(coding[FTI_Topo->groupRank] + 0, sizeof(char), bs, efd); pos = pos + bs; } @@ -280,16 +282,17 @@ int FTI_Decode(int fs, int maxFs, int* erased) **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverL1(int group) +int FTI_RecoverL1(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group) { int erased[FTI_BUFS], buf, i, j; // FTI_BUFS > 32*3 unsigned long fs, maxFs; - if (FTI_CheckErasures(&fs, &maxFs, group, erased, 1) != FTI_SCES) { + if (FTI_CheckErasures(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, group, erased, 1) != FTI_SCES) { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } buf = 0; - for (j = 0; j < FTI_Topo.groupSize; j++) + for (j = 0; j < FTI_Topo->groupSize; j++) if (erased[j]) buf++; // Counting erasures if (buf > 0) { @@ -311,7 +314,8 @@ int FTI_RecoverL1(int group) **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverL2(int group) +int FTI_RecoverL2(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group) { int erased[FTI_BUFS], gs, buf, j, src, dest; char str[FTI_BUFS], lfn[FTI_BUFS], pfn[FTI_BUFS], jfn[FTI_BUFS], qfn[FTI_BUFS]; @@ 
-323,21 +327,21 @@ int FTI_RecoverL2(int group) MPI_Request reqSend1, reqRecv1, reqSend2, reqRecv2; MPI_Status status; - blBuf1 = talloc(char, FTI_Conf.blockSize); - blBuf2 = talloc(char, FTI_Conf.blockSize); - blBuf3 = talloc(char, FTI_Conf.blockSize); - blBuf4 = talloc(char, FTI_Conf.blockSize); + blBuf1 = talloc(char, FTI_Conf->blockSize); + blBuf2 = talloc(char, FTI_Conf->blockSize); + blBuf3 = talloc(char, FTI_Conf->blockSize); + blBuf4 = talloc(char, FTI_Conf->blockSize); - gs = FTI_Topo.groupSize; - src = FTI_Topo.left; - dest = FTI_Topo.right; + gs = FTI_Topo->groupSize; + src = FTI_Topo->left; + dest = FTI_Topo->right; if (mkdir(FTI_Ckpt[2].dir, 0777) == -1) if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); // Checking erasures - if (FTI_CheckErasures(&fs, &maxFs, group, erased, 2) != FTI_SCES) { + if (FTI_CheckErasures(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, group, erased, 2) != FTI_SCES) { FTI_Print("Error checking erasures.", FTI_DBUG); free(blBuf1); @@ -369,18 +373,18 @@ int FTI_RecoverL2(int group) if (erased[j]) buf++; // Counting erasures if (buf > 0) { - ps = (maxFs / FTI_Conf.blockSize) * FTI_Conf.blockSize; + ps = (maxFs / FTI_Conf->blockSize) * FTI_Conf->blockSize; pos = 0; // For the logic if (ps < maxFs) - ps = ps + FTI_Conf.blockSize; // Calculating padding size + ps = ps + FTI_Conf->blockSize; // Calculating padding size sprintf(str, "File size: %ld, max. 
file size : %ld and padding size : %ld.", fs, maxFs, ps); FTI_Print(str, FTI_DBUG); // Open checkpoint file to recover - if (erased[FTI_Topo.groupRank]) { - sprintf(lfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); - sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); - sprintf(jfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); + if (erased[FTI_Topo->groupRank]) { + sprintf(lfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec->ckptFile); + sscanf(FTI_Exec->ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec->ckptID, &buf); + sprintf(jfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec->ckptID, buf); sprintf(str, "Opening checkpoint file (%s) to recover (L2).", lfn); FTI_Print(str, FTI_DBUG); sprintf(str, "Opening partner ckpt. file (%s) to recover (L2).", jfn); @@ -414,9 +418,9 @@ int FTI_RecoverL2(int group) } // Truncate and open partner file to transfer - if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { - sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); - sprintf(pfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); + if (erased[src] && !erased[gs + FTI_Topo->groupRank]) { + sscanf(FTI_Exec->ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec->ckptID, &buf); + sprintf(pfn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec->ckptID, buf); sprintf(str, "Opening partner ckpt. file (%s) to transfer (L2).", pfn); FTI_Print(str, FTI_DBUG); @@ -455,8 +459,8 @@ int FTI_RecoverL2(int group) } // Truncate and open partner file to transfer - if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { - sprintf(qfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); + if (erased[dest] && !erased[gs + FTI_Topo->groupRank]) { + sprintf(qfn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec->ckptFile); sprintf(str, "Opening ckpt. 
file (%s) to transfer (L2).", qfn); FTI_Print(str, FTI_DBUG); @@ -500,8 +504,8 @@ int FTI_RecoverL2(int group) // Checkpoint files exchange while (pos < ps) { - if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { - size_t bytes = fread(blBuf1, sizeof(char), FTI_Conf.blockSize, pfd); + if (erased[src] && !erased[gs + FTI_Topo->groupRank]) { + size_t bytes = fread(blBuf1, sizeof(char), FTI_Conf->blockSize, pfd); if (ferror(pfd)) { FTI_Print("Error reading the data from the partner ckpt. file.", FTI_DBUG); @@ -523,10 +527,10 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } - MPI_Isend(blBuf1, bytes, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend1); + MPI_Isend(blBuf1, bytes, MPI_CHAR, src, FTI_Conf->tag, FTI_Exec->groupComm, &reqSend1); } - if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { - size_t bytes = fread(blBuf3, sizeof(char), FTI_Conf.blockSize, qfd); + if (erased[dest] && !erased[gs + FTI_Topo->groupRank]) { + size_t bytes = fread(blBuf3, sizeof(char), FTI_Conf->blockSize, qfd); if (ferror(qfd)) { FTI_Print("Error reading the data from the ckpt. 
file.", FTI_DBUG); @@ -548,21 +552,21 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } - MPI_Isend(blBuf3, bytes, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqSend2); + MPI_Isend(blBuf3, bytes, MPI_CHAR, dest, FTI_Conf->tag, FTI_Exec->groupComm, &reqSend2); } - if (erased[FTI_Topo.groupRank]) { - MPI_Irecv(blBuf2, FTI_Conf.blockSize, MPI_CHAR, dest, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv1); - MPI_Irecv(blBuf4, FTI_Conf.blockSize, MPI_CHAR, src, FTI_Conf.tag, FTI_Exec.groupComm, &reqRecv2); + if (erased[FTI_Topo->groupRank]) { + MPI_Irecv(blBuf2, FTI_Conf->blockSize, MPI_CHAR, dest, FTI_Conf->tag, FTI_Exec->groupComm, &reqRecv1); + MPI_Irecv(blBuf4, FTI_Conf->blockSize, MPI_CHAR, src, FTI_Conf->tag, FTI_Exec->groupComm, &reqRecv2); } - if (erased[src] && !erased[gs + FTI_Topo.groupRank]) + if (erased[src] && !erased[gs + FTI_Topo->groupRank]) MPI_Wait(&reqSend1, &status); - if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) + if (erased[dest] && !erased[gs + FTI_Topo->groupRank]) MPI_Wait(&reqSend2, &status); - if (erased[FTI_Topo.groupRank]) { + if (erased[FTI_Topo->groupRank]) { MPI_Wait(&reqRecv1, &status); MPI_Wait(&reqRecv2, &status); - fwrite(blBuf2, sizeof(char), FTI_Conf.blockSize, lfd); + fwrite(blBuf2, sizeof(char), FTI_Conf->blockSize, lfd); if (ferror(lfd)) { FTI_Print("Errors writting the data in the R2 checkpoint file.", FTI_DBUG); @@ -583,7 +587,7 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } - fwrite(blBuf4, sizeof(char), FTI_Conf.blockSize, jfd); + fwrite(blBuf4, sizeof(char), FTI_Conf->blockSize, jfd); if (ferror(jfd)) { FTI_Print("Errors writting the data in the R2 partner ckpt. 
file.", FTI_DBUG); @@ -603,11 +607,11 @@ int FTI_RecoverL2(int group) return FTI_NSCS; } } - pos = pos + FTI_Conf.blockSize; + pos = pos + FTI_Conf->blockSize; } // Close files - if (erased[FTI_Topo.groupRank]) { + if (erased[FTI_Topo->groupRank]) { if (fclose(lfd) != 0) { FTI_Print("R2 cannot close the checkpoint file.", FTI_DBUG); @@ -675,7 +679,7 @@ int FTI_RecoverL2(int group) } } - if (erased[src] && !erased[gs + FTI_Topo.groupRank]) { + if (erased[src] && !erased[gs + FTI_Topo->groupRank]) { if (fclose(pfd) != 0) { FTI_Print("R2 cannot close the partner ckpt. file", FTI_DBUG); @@ -704,7 +708,7 @@ int FTI_RecoverL2(int group) } } - if (erased[dest] && !erased[gs + FTI_Topo.groupRank]) { + if (erased[dest] && !erased[gs + FTI_Topo->groupRank]) { if (fclose(qfd) != 0) { FTI_Print("R2 cannot close the ckpt. file", FTI_DBUG); @@ -748,19 +752,20 @@ int FTI_RecoverL2(int group) **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverL3(int group) +int FTI_RecoverL3(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group) { int erased[FTI_BUFS], gs, j, l = 0; unsigned long fs, maxFs; char str[FTI_BUFS]; - gs = FTI_Topo.groupSize; + gs = FTI_Topo->groupSize; if (mkdir(FTI_Ckpt[3].dir, 0777) == -1) if (errno != EEXIST) FTI_Print("Cannot create directory", FTI_EROR); // Checking erasures - if (FTI_CheckErasures(&fs, &maxFs, group, erased, 3) != FTI_SCES) { + if (FTI_CheckErasures(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, group, erased, 3) != FTI_SCES) { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } @@ -782,7 +787,7 @@ int FTI_RecoverL3(int group) if (l > 0) { sprintf(str, "There are %d encoded/checkpoint files missing in this group.", l); FTI_Print(str, FTI_DBUG); - if (FTI_Decode(fs, maxFs, erased) == FTI_NSCS) { + if (FTI_Decode(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, fs, maxFs, erased) == FTI_NSCS) { FTI_Print("RS-decoding could 
not regenerate the missing data.", FTI_DBUG); return FTI_NSCS; } @@ -803,15 +808,16 @@ int FTI_RecoverL3(int group) **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverL4(int group) +int FTI_RecoverL4(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, int group) { unsigned long maxFs, fs, ps, pos = 0; int j, l, gs, erased[FTI_BUFS]; char gfn[FTI_BUFS], lfn[FTI_BUFS]; FILE *gfd, *lfd; - gs = FTI_Topo.groupSize; - if (FTI_Topo.nodeRank == 0 || FTI_Topo.nodeRank == 1) { + gs = FTI_Topo->groupSize; + if (FTI_Topo->nodeRank == 0 || FTI_Topo->nodeRank == 1) { if (mkdir(FTI_Ckpt[1].dir, 0777) == -1) { if (errno != EEXIST) FTI_Print("Directory L1 could NOT be created.", FTI_WARN); @@ -819,7 +825,7 @@ int FTI_RecoverL4(int group) } MPI_Barrier(FTI_COMM_WORLD); // Checking erasures - if (FTI_CheckErasures(&fs, &maxFs, group, erased, 4) != FTI_SCES) { + if (FTI_CheckErasures(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, group, erased, 4) != FTI_SCES) { FTI_Print("Error checking erasures.", FTI_DBUG); return FTI_NSCS; } @@ -835,14 +841,14 @@ int FTI_RecoverL4(int group) return FTI_NSCS; } - ps = (fs / FTI_Conf.blockSize) * FTI_Conf.blockSize; + ps = (fs / FTI_Conf->blockSize) * FTI_Conf->blockSize; pos = 0; // For the logic // Calculating padding size if (ps < fs) - ps = ps + FTI_Conf.blockSize; + ps = ps + FTI_Conf->blockSize; // Open and resize files - sprintf(gfn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec.ckptFile); - sprintf(lfn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); + sprintf(gfn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec->ckptFile); + sprintf(lfn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec->ckptFile); if (truncate(gfn, ps) == -1) { FTI_Print("R4 cannot truncate the ckpt. 
file in the PFS.", FTI_DBUG); @@ -862,10 +868,10 @@ int FTI_RecoverL4(int group) return FTI_NSCS; } - char *blBuf1 = talloc(char, FTI_Conf.blockSize); + char *blBuf1 = talloc(char, FTI_Conf->blockSize); // Checkpoint files transfer from PFS while (pos < ps) { - size_t bytes = fread(blBuf1, sizeof(char), FTI_Conf.blockSize, gfd); + size_t bytes = fread(blBuf1, sizeof(char), FTI_Conf->blockSize, gfd); if (ferror(gfd)) { FTI_Print("R4 cannot read from the ckpt. file in the PFS.", FTI_DBUG); @@ -889,7 +895,7 @@ int FTI_RecoverL4(int group) return FTI_NSCS; } - pos = pos + FTI_Conf.blockSize; + pos = pos + FTI_Conf->blockSize; } free(blBuf1); diff --git a/src/recover.c b/src/recover.c index dcd1bbd19..05179d5df 100644 --- a/src/recover.c +++ b/src/recover.c @@ -56,50 +56,53 @@ int FTI_CheckFile(char* fn, unsigned long fs) **/ /*-------------------------------------------------------------------------*/ -int FTI_CheckErasures(unsigned long* fs, unsigned long* maxFs, int group, int* erased, int level) +int FTI_CheckErasures(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + unsigned long *fs, unsigned long *maxFs, int group, + int *erased, int level) { int buf; char fn[FTI_BUFS]; - if (FTI_GetMeta(fs, maxFs, group, level) == FTI_SCES) { + if (FTI_GetMeta(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, fs, maxFs, group, level) == FTI_SCES) { FTI_Print("Metadata obtained.", FTI_DBUG); } else { FTI_Print("Error getting metadata.", FTI_WARN); return FTI_NSCS; } - sprintf(fn, "Checking file %s and its erasures.", FTI_Exec.ckptFile); + sprintf(fn, "Checking file %s and its erasures.", FTI_Exec->ckptFile); FTI_Print(fn, FTI_DBUG); switch (level) { case 1: { - sprintf(fn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec.ckptFile); + sprintf(fn, "%s/%s", FTI_Ckpt[1].dir, FTI_Exec->ckptFile); buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); + MPI_Allgather(&buf, 1, MPI_INT, erased, 1, 
MPI_INT, FTI_Exec->groupComm); break; } case 2: { - sprintf(fn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec.ckptFile); + sprintf(fn, "%s/%s", FTI_Ckpt[2].dir, FTI_Exec->ckptFile); buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); - sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); - sprintf(fn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec.ckptID, buf); + MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec->groupComm); + sscanf(FTI_Exec->ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec->ckptID, &buf); + sprintf(fn, "%s/Ckpt%d-Pcof%d.fti", FTI_Ckpt[2].dir, FTI_Exec->ckptID, buf); buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased + FTI_Topo.groupSize, 1, MPI_INT, FTI_Exec.groupComm); + MPI_Allgather(&buf, 1, MPI_INT, erased + FTI_Topo->groupSize, 1, MPI_INT, FTI_Exec->groupComm); break; } case 3: { - sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec.ckptFile); + sprintf(fn, "%s/%s", FTI_Ckpt[3].dir, FTI_Exec->ckptFile); buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); - sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec.ckptID, &buf); - sprintf(fn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec.ckptID, buf); + MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec->groupComm); + sscanf(FTI_Exec->ckptFile, "Ckpt%d-Rank%d.fti", &FTI_Exec->ckptID, &buf); + sprintf(fn, "%s/Ckpt%d-RSed%d.fti", FTI_Ckpt[3].dir, FTI_Exec->ckptID, buf); buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased + FTI_Topo.groupSize, 1, MPI_INT, FTI_Exec.groupComm); + MPI_Allgather(&buf, 1, MPI_INT, erased + FTI_Topo->groupSize, 1, MPI_INT, FTI_Exec->groupComm); break; } case 4: { - sprintf(fn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec.ckptFile); + sprintf(fn, "%s/%s", FTI_Ckpt[4].dir, FTI_Exec->ckptFile); buf = FTI_CheckFile(fn, *fs); - MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec.groupComm); + 
MPI_Allgather(&buf, 1, MPI_INT, erased, 1, MPI_INT, FTI_Exec->groupComm); break; } } @@ -117,45 +120,46 @@ int FTI_CheckErasures(unsigned long* fs, unsigned long* maxFs, int group, int* e **/ /*-------------------------------------------------------------------------*/ -int FTI_RecoverFiles() +int FTI_RecoverFiles(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt) { int f, r, tres = FTI_SCES, id, level = 1; unsigned long fs, maxFs; char str[FTI_BUFS]; - if (FTI_Topo.nbHeads == 1) { + if (FTI_Topo->nbHeads == 1) { f = 1; } else { f = 0; } - if (!FTI_Topo.amIaHead) { + if (!FTI_Topo->amIaHead) { while (level < 5) { - if ((FTI_Exec.reco == 2) && (level != 4)) { + if ((FTI_Exec->reco == 2) && (level != 4)) { tres = FTI_NSCS; } else { - if (FTI_GetMeta(&fs, &maxFs, f, level) != FTI_SCES) { + if (FTI_GetMeta(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, &fs, &maxFs, f, level) != FTI_SCES) { tres = FTI_NSCS; } else { - sscanf(FTI_Exec.ckptFile, "Ckpt%d-Rank%d.fti", &id, &r); + sscanf(FTI_Exec->ckptFile, "Ckpt%d-Rank%d.fti", &id, &r); sprintf(str, "Trying recovery with Ckpt. 
%d at level %d.", id, level); FTI_Print(str, FTI_DBUG); - FTI_Exec.ckptID = id; - FTI_Exec.ckptLvel = level; - FTI_Exec.lastCkptLvel = FTI_Exec.ckptLvel; - if (FTI_Exec.ckptLvel == 4) { - FTI_Clean(1, FTI_Topo.groupID, FTI_Topo.myRank); + FTI_Exec->ckptID = id; + FTI_Exec->ckptLvel = level; + FTI_Exec->lastCkptLvel = FTI_Exec->ckptLvel; + if (FTI_Exec->ckptLvel == 4) { + FTI_Clean(FTI_Conf, FTI_Topo, FTI_Ckpt, 1, FTI_Topo->groupID, FTI_Topo->myRank); MPI_Barrier(FTI_COMM_WORLD); } - if (FTI_Exec.ckptLvel == 4) - r = FTI_RecoverL4(FTI_Topo.groupID); - if (FTI_Exec.ckptLvel == 3) - r = FTI_RecoverL3(FTI_Topo.groupID); - if (FTI_Exec.ckptLvel == 2) - r = FTI_RecoverL2(FTI_Topo.groupID); - if (FTI_Exec.ckptLvel == 1) - r = FTI_RecoverL1(FTI_Topo.groupID); + if (FTI_Exec->ckptLvel == 4) + r = FTI_RecoverL4(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, FTI_Topo->groupID); + if (FTI_Exec->ckptLvel == 3) + r = FTI_RecoverL3(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, FTI_Topo->groupID); + if (FTI_Exec->ckptLvel == 2) + r = FTI_RecoverL2(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, FTI_Topo->groupID); + if (FTI_Exec->ckptLvel == 1) + r = FTI_RecoverL1(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, FTI_Topo->groupID); MPI_Allreduce(&r, &tres, 1, MPI_INT, MPI_SUM, FTI_COMM_WORLD); } } @@ -173,8 +177,8 @@ int FTI_RecoverFiles() } } fs = tres; - MPI_Allreduce(&fs, &tres, 1, MPI_INT, MPI_SUM, FTI_Exec.globalComm); - MPI_Barrier(FTI_Exec.globalComm); + MPI_Allreduce(&fs, &tres, 1, MPI_INT, MPI_SUM, FTI_Exec->globalComm); + MPI_Barrier(FTI_Exec->globalComm); sleep(1); // Global barrier and sleep for clearer output return tres; } diff --git a/src/tools.c b/src/tools.c index 8e171bb0d..ab58e9fda 100644 --- a/src/tools.c +++ b/src/tools.c @@ -8,48 +8,6 @@ #include "interface.h" #include -int FTI_Clean(int level, int group, int rank); - -/*-------------------------------------------------------------------------*/ -/** - @brief Prints FTI messages. - @param msg Message to print. 
- @param priority Priority of the message to be printed. - @return void - - This function prints messages depending on their priority and the - verbosity level set by the user. DEBUG messages are printed by all - processes with their rank. INFO messages are printed by one process. - ERROR messages are printed with errno. - - **/ -/*-------------------------------------------------------------------------*/ -void FTI_Print(char* msg, int priority) -{ - if (priority >= FTI_Conf.verbosity) { - if (msg != NULL) { - switch (priority) { - case FTI_EROR: - fprintf(stderr, "[FTI Error - %06d] : %s : %s \n", FTI_Topo.myRank, msg, strerror(errno)); - break; - case FTI_WARN: - fprintf(stdout, "[FTI Warning %06d] : %s \n", FTI_Topo.myRank, msg); - break; - case FTI_INFO: - if (FTI_Topo.splitRank == 0) - fprintf(stdout, "[ FTI Information ] : %s \n", msg); - break; - case FTI_DBUG: - fprintf(stdout, "[FTI Debug - %06d] : %s \n", FTI_Topo.myRank, msg); - break; - default: - break; - } - } - } - fflush(stdout); -} - /*-------------------------------------------------------------------------*/ /** @brief Receive the return code of a function and print a message. 
@@ -88,7 +46,7 @@ int FTI_Try(int result, char* message) **/ /*-------------------------------------------------------------------------*/ -int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]) +int FTI_InitBasicTypes(FTIT_dataset* FTI_Data) { int i; for (i = 0; i < FTI_BUFS; i++) { @@ -125,30 +83,37 @@ int FTI_InitBasicTypes(FTIT_dataset FTI_Data[FTI_BUFS]) int FTI_RmDir(char path[FTI_BUFS], int flag) { if (flag) { - DIR* dp; char buf[FTI_BUFS], fn[FTI_BUFS], fil[FTI_BUFS]; - struct dirent* ep; - dp = opendir(path); + DIR* dp = NULL; + struct dirent* ep = NULL; + sprintf(buf, "Removing directory %s and its files.", path); FTI_Print(buf, FTI_DBUG); + + dp = opendir(path); if (dp != NULL) { - while (ep = readdir(dp)) { + while ((ep = readdir(dp)) != NULL) { sprintf(fil, "%s", ep->d_name); if ((strcmp(fil, ".") != 0) && (strcmp(fil, "..") != 0)) { sprintf(fn, "%s/%s", path, fil); sprintf(buf, "File %s will be removed.", fn); FTI_Print(buf, FTI_DBUG); - if (remove(fn) != 0) - FTI_Print("Error removing target file.", FTI_EROR); + if (remove(fn) == -1) + if (errno != ENOENT) + FTI_Print("Error removing target file.", FTI_EROR); } } } else { - FTI_Print("Error with opendir.", FTI_EROR); + if (errno != ENOENT) + FTI_Print("Error with opendir.", FTI_EROR); } - closedir(dp); - if (remove(path) != 0) - FTI_Print("Error removing target directory.", FTI_EROR); + if (dp != NULL) + closedir(dp); + + if (remove(path) == -1) + if (errno != ENOENT) + FTI_Print("Error removing target directory.", FTI_EROR); } return FTI_SCES; } @@ -167,17 +132,18 @@ int FTI_RmDir(char path[FTI_BUFS], int flag) **/ /*-------------------------------------------------------------------------*/ -int FTI_Clean(int level, int group, int rank) +int FTI_Clean(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + FTIT_checkpoint* FTI_Ckpt, int level, int group, int rank) { char buf[FTI_BUFS]; - int nodeFlag, globalFlag = !FTI_Topo.splitRank; + int nodeFlag, globalFlag = !FTI_Topo->splitRank; - nodeFlag 
= (((!FTI_Topo.amIaHead) && (FTI_Topo.nodeRank == 0)) || (FTI_Topo.amIaHead)) ? 1 : 0; + nodeFlag = (((!FTI_Topo->amIaHead) && (FTI_Topo->nodeRank == 0)) || (FTI_Topo->amIaHead)) ? 1 : 0; if (level == 0) { - FTI_RmDir(FTI_Conf.mTmpDir, globalFlag); - FTI_RmDir(FTI_Conf.gTmpDir, globalFlag); - FTI_RmDir(FTI_Conf.lTmpDir, nodeFlag); + FTI_RmDir(FTI_Conf->mTmpDir, globalFlag); + FTI_RmDir(FTI_Conf->gTmpDir, globalFlag); + FTI_RmDir(FTI_Conf->lTmpDir, nodeFlag); } // Clean last checkpoint level 1 @@ -202,24 +168,25 @@ int FTI_Clean(int level, int group, int rank) if (level == 4 || level == 5) { FTI_RmDir(FTI_Ckpt[4].metaDir, globalFlag); FTI_RmDir(FTI_Ckpt[4].dir, globalFlag); - rmdir(FTI_Conf.gTmpDir); + rmdir(FTI_Conf->gTmpDir); } // If it is the very last cleaning and we DO NOT keep the last checkpoint if (level == 5) { - rmdir(FTI_Conf.lTmpDir); - rmdir(FTI_Conf.localDir); - rmdir(FTI_Conf.glbalDir); - snprintf(buf, FTI_BUFS, "%s/Topology.fti", FTI_Conf.metadDir); + rmdir(FTI_Conf->lTmpDir); + rmdir(FTI_Conf->localDir); + rmdir(FTI_Conf->glbalDir); + snprintf(buf, FTI_BUFS, "%s/Topology.fti", FTI_Conf->metadDir); if (remove(buf) == -1) - FTI_Print("Cannot remove Topology.fti", FTI_EROR); - rmdir(FTI_Conf.metadDir); + if (errno != ENOENT) + FTI_Print("Cannot remove Topology.fti", FTI_EROR); + rmdir(FTI_Conf->metadDir); } // If it is the very last cleaning and we DO keep the last checkpoint if (level == 6) { - rmdir(FTI_Conf.lTmpDir); - rmdir(FTI_Conf.localDir); + rmdir(FTI_Conf->lTmpDir); + rmdir(FTI_Conf->localDir); } return FTI_SCES; diff --git a/src/topo.c b/src/topo.c index 6cd8c85f3..b2189b152 100644 --- a/src/topo.c +++ b/src/topo.c @@ -19,16 +19,16 @@ **/ /*-------------------------------------------------------------------------*/ -int FTI_SaveTopo(char* nameList) +int FTI_SaveTopo(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, char* nameList) { char mfn[FTI_BUFS], str[FTI_BUFS]; dictionary* ini; int i; - sprintf(str, "Trying to load configuration 
file (%s) to create topology.", FTI_Conf.cfgFile); + sprintf(str, "Trying to load configuration file (%s) to create topology.", FTI_Conf->cfgFile); FTI_Print(str, FTI_DBUG); - ini = iniparser_load(FTI_Conf.cfgFile); + ini = iniparser_load(FTI_Conf->cfgFile); if (ini == NULL) { FTI_Print("Iniparser cannot parse the configuration file.", FTI_WARN); @@ -39,7 +39,7 @@ int FTI_SaveTopo(char* nameList) iniparser_set(ini, "topology", NULL); // Write list of nodes - for (i = 0; i < FTI_Topo.nbNodes; i++) { + for (i = 0; i < FTI_Topo->nbNodes; i++) { strncpy(mfn, nameList + (i * FTI_BUFS), FTI_BUFS - 1); sprintf(str, "topology:%d", i); iniparser_set(ini, str, mfn); @@ -50,7 +50,7 @@ int FTI_SaveTopo(char* nameList) iniparser_unset(ini, "restart"); iniparser_unset(ini, "advanced"); - sprintf(mfn, "%s/Topology.fti", FTI_Conf.metadDir); + sprintf(mfn, "%s/Topology.fti", FTI_Conf->metadDir); sprintf(str, "Creating topology file (%s)...", mfn); FTI_Print(str, FTI_DBUG); @@ -100,20 +100,21 @@ int FTI_SaveTopo(char* nameList) **/ /*-------------------------------------------------------------------------*/ -int FTI_ReorderNodes(int* nodeList, char* nameList) +int FTI_ReorderNodes(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, + int* nodeList, char* nameList) { char mfn[FTI_BUFS], str[FTI_BUFS], *tmp; int i, j, *nl, *old, *new; - nl = talloc(int, FTI_Topo.nbProc); - old = talloc(int, FTI_Topo.nbNodes); - new = talloc(int, FTI_Topo.nbNodes); - for (i = 0; i < FTI_Topo.nbNodes; i++) { + nl = talloc(int, FTI_Topo->nbProc); + old = talloc(int, FTI_Topo->nbNodes); + new = talloc(int, FTI_Topo->nbNodes); + for (i = 0; i < FTI_Topo->nbNodes; i++) { old[i] = -1; new[i] = -1; } - sprintf(mfn, "%s/Topology.fti", FTI_Conf.metadDir); + sprintf(mfn, "%s/Topology.fti", FTI_Conf->metadDir); sprintf(str, "Loading FTI topology file (%s) to reorder nodes...", mfn); FTI_Print(str, FTI_DBUG); @@ -141,13 +142,13 @@ int FTI_ReorderNodes(int* nodeList, char* nameList) } // Get the old order of 
nodes - for (i = 0; i < FTI_Topo.nbNodes; i++) { + for (i = 0; i < FTI_Topo->nbNodes; i++) { sprintf(str, "Topology:%d", i); tmp = iniparser_getstring(ini, str, NULL); snprintf(str, FTI_BUFS, "%s", tmp); // Search for same node in current nameList - for (j = 0; j < FTI_Topo.nbNodes; j++) { + for (j = 0; j < FTI_Topo->nbNodes; j++) { // If found... if (strncmp(str, nameList + (j * FTI_BUFS), FTI_BUFS) == 0) { old[j] = i; @@ -161,7 +162,7 @@ int FTI_ReorderNodes(int* nodeList, char* nameList) j = 0; // Introducing missing nodes - for (i = 0; i < FTI_Topo.nbNodes; i++) { + for (i = 0; i < FTI_Topo->nbNodes; i++) { // For each new node.. if (new[i] == -1) { // ..search for an old node not present in the new list.. @@ -175,13 +176,13 @@ int FTI_ReorderNodes(int* nodeList, char* nameList) } } // Copying nodeList in nl - for (i = 0; i < FTI_Topo.nbProc; i++) { + for (i = 0; i < FTI_Topo->nbProc; i++) { nl[i] = nodeList[i]; } // Creating the new nodeList with the old order - for (i = 0; i < FTI_Topo.nbNodes; i++) { - for (j = 0; j < FTI_Topo.nodeSize; j++) { - nodeList[(i * FTI_Topo.nodeSize) + j] = nl[(new[i] * FTI_Topo.nodeSize) + j]; + for (i = 0; i < FTI_Topo->nbNodes; i++) { + for (j = 0; j < FTI_Topo->nodeSize; j++) { + nodeList[(i * FTI_Topo->nodeSize) + j] = nl[(new[i] * FTI_Topo->nodeSize) + j]; } } @@ -206,22 +207,23 @@ int FTI_ReorderNodes(int* nodeList, char* nameList) **/ /*-------------------------------------------------------------------------*/ -int FTI_BuildNodeList(int* nodeList, char* nameList) +int FTI_BuildNodeList(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, int* nodeList, char* nameList) { int i, found, pos, p, nbNodes = 0; char hname[FTI_BUFS], str[FTI_BUFS], *lhn; - lhn = talloc(char, FTI_BUFS* FTI_Topo.nbProc); - memset(lhn + (FTI_Topo.myRank * FTI_BUFS), 0, FTI_BUFS); // To get local hostname - if (!FTI_Conf.test) { - gethostname(lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS); // NOT local test + lhn = 
talloc(char, FTI_BUFS* FTI_Topo->nbProc); + memset(lhn + (FTI_Topo->myRank * FTI_BUFS), 0, FTI_BUFS); // To get local hostname + if (!FTI_Conf->test) { + gethostname(lhn + (FTI_Topo->myRank * FTI_BUFS), FTI_BUFS); // NOT local test } else { - snprintf(lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS, "node%d", FTI_Topo.myRank / FTI_Topo.nodeSize); // Local + snprintf(lhn + (FTI_Topo->myRank * FTI_BUFS), FTI_BUFS, "node%d", FTI_Topo->myRank / FTI_Topo->nodeSize); // Local } - strncpy(hname, lhn + (FTI_Topo.myRank * FTI_BUFS), FTI_BUFS - 1); // Distributing host names - MPI_Allgather(hname, FTI_BUFS, MPI_CHAR, lhn, FTI_BUFS, MPI_CHAR, FTI_Exec.globalComm); + strncpy(hname, lhn + (FTI_Topo->myRank * FTI_BUFS), FTI_BUFS - 1); // Distributing host names + MPI_Allgather(hname, FTI_BUFS, MPI_CHAR, lhn, FTI_BUFS, MPI_CHAR, FTI_Exec->globalComm); - for (i = 0; i < FTI_Topo.nbProc; i++) { // Creating the node list: For each process + for (i = 0; i < FTI_Topo->nbProc; i++) { // Creating the node list: For each process found = 0; pos = 0; strncpy(hname, lhn + (i * FTI_BUFS), FTI_BUFS - 1); // Get node name of process i @@ -234,8 +236,8 @@ int FTI_BuildNodeList(int* nodeList, char* nameList) } } if (found) { // If we found the node name in the current list... - p = pos * FTI_Topo.nodeSize; - while (p < pos * FTI_Topo.nodeSize + FTI_Topo.nodeSize) { // ... we look for empty spot in this node + p = pos * FTI_Topo->nodeSize; + while (p < pos * FTI_Topo->nodeSize + FTI_Topo->nodeSize) { // ... we look for empty spot in this node if (nodeList[p] == -1) { nodeList[p] = i; break; @@ -247,13 +249,13 @@ int FTI_BuildNodeList(int* nodeList, char* nameList) } else { // ... 
else, we add the new node to the end of the current list of nodes strncpy(&(nameList[pos * FTI_BUFS]), hname, FTI_BUFS - 1); - nodeList[pos * FTI_Topo.nodeSize] = i; + nodeList[pos * FTI_Topo->nodeSize] = i; nbNodes++; } } - for (i = 0; i < FTI_Topo.nbProc; i++) { // Checking that all nodes have nodeSize processes + for (i = 0; i < FTI_Topo->nbProc; i++) { // Checking that all nodes have nodeSize processes if (nodeList[i] == -1) { - sprintf(str, "Node %d has no %d processes", i / FTI_Topo.nodeSize, FTI_Topo.nodeSize); + sprintf(str, "Node %d has no %d processes", i / FTI_Topo->nodeSize, FTI_Topo->nodeSize); FTI_Print(str, FTI_WARN); return FTI_NSCS; } @@ -276,42 +278,44 @@ int FTI_BuildNodeList(int* nodeList, char* nameList) **/ /*-------------------------------------------------------------------------*/ -int FTI_CreateComms(int* userProcList, int* distProcList, int* nodeList) +int FTI_CreateComms(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, int* userProcList, + int* distProcList, int* nodeList) { MPI_Status status; char str[FTI_BUFS]; MPI_Group newGroup, origGroup; - MPI_Comm_group(FTI_Exec.globalComm, &origGroup); + MPI_Comm_group(FTI_Exec->globalComm, &origGroup); int i, src, buf, group[FTI_BUFS]; // FTI_BUFS > Max. 
group size - if (FTI_Topo.amIaHead) { - MPI_Group_incl(origGroup, FTI_Topo.nbNodes * FTI_Topo.nbHeads, distProcList, &newGroup); - MPI_Comm_create(FTI_Exec.globalComm, newGroup, &FTI_COMM_WORLD); - for (i = FTI_Topo.nbHeads; i < FTI_Topo.nodeSize; i++) { - src = nodeList[(FTI_Topo.nodeID * FTI_Topo.nodeSize) + i]; - MPI_Recv(&buf, 1, MPI_INT, src, FTI_Conf.tag, FTI_Exec.globalComm, &status); + if (FTI_Topo->amIaHead) { + MPI_Group_incl(origGroup, FTI_Topo->nbNodes * FTI_Topo->nbHeads, distProcList, &newGroup); + MPI_Comm_create(FTI_Exec->globalComm, newGroup, &FTI_COMM_WORLD); + for (i = FTI_Topo->nbHeads; i < FTI_Topo->nodeSize; i++) { + src = nodeList[(FTI_Topo->nodeID * FTI_Topo->nodeSize) + i]; + MPI_Recv(&buf, 1, MPI_INT, src, FTI_Conf->tag, FTI_Exec->globalComm, &status); if (buf == src) { - FTI_Topo.body[i - FTI_Topo.nbHeads] = src; + FTI_Topo->body[i - FTI_Topo->nbHeads] = src; } } } else { - MPI_Group_incl(origGroup, FTI_Topo.nbProc - (FTI_Topo.nbNodes * FTI_Topo.nbHeads), userProcList, &newGroup); - MPI_Comm_create(FTI_Exec.globalComm, newGroup, &FTI_COMM_WORLD); - if (FTI_Topo.nbHeads == 1) { - MPI_Send(&(FTI_Topo.myRank), 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm); + MPI_Group_incl(origGroup, FTI_Topo->nbProc - (FTI_Topo->nbNodes * FTI_Topo->nbHeads), userProcList, &newGroup); + MPI_Comm_create(FTI_Exec->globalComm, newGroup, &FTI_COMM_WORLD); + if (FTI_Topo->nbHeads == 1) { + MPI_Send(&(FTI_Topo->myRank), 1, MPI_INT, FTI_Topo->headRank, FTI_Conf->tag, FTI_Exec->globalComm); } } - MPI_Comm_rank(FTI_COMM_WORLD, &FTI_Topo.splitRank); - buf = FTI_Topo.sectorID * FTI_Topo.groupSize; - for (i = 0; i < FTI_Topo.groupSize; i++) { // Group of node-distributed processes (Topology-aware). + MPI_Comm_rank(FTI_COMM_WORLD, &FTI_Topo->splitRank); + buf = FTI_Topo->sectorID * FTI_Topo->groupSize; + for (i = 0; i < FTI_Topo->groupSize; i++) { // Group of node-distributed processes (Topology-aware). 
group[i] = distProcList[buf + i]; } - MPI_Comm_group(FTI_Exec.globalComm, &origGroup); - MPI_Group_incl(origGroup, FTI_Topo.groupSize, group, &newGroup); - MPI_Comm_create(FTI_Exec.globalComm, newGroup, &FTI_Exec.groupComm); - MPI_Group_rank(newGroup, &(FTI_Topo.groupRank)); - FTI_Topo.right = (FTI_Topo.groupRank + 1) % FTI_Topo.groupSize; - FTI_Topo.left = (FTI_Topo.groupRank + FTI_Topo.groupSize - 1) % FTI_Topo.groupSize; + MPI_Comm_group(FTI_Exec->globalComm, &origGroup); + MPI_Group_incl(origGroup, FTI_Topo->groupSize, group, &newGroup); + MPI_Comm_create(FTI_Exec->globalComm, newGroup, &FTI_Exec->groupComm); + MPI_Group_rank(newGroup, &(FTI_Topo->groupRank)); + FTI_Topo->right = (FTI_Topo->groupRank + 1) % FTI_Topo->groupSize; + FTI_Topo->left = (FTI_Topo->groupRank + FTI_Topo->groupSize - 1) % FTI_Topo->groupSize; MPI_Group_free(&origGroup); MPI_Group_free(&newGroup); return FTI_SCES; @@ -328,17 +332,18 @@ int FTI_CreateComms(int* userProcList, int* distProcList, int* nodeList) **/ /*-------------------------------------------------------------------------*/ -int FTI_Topology() +int FTI_Topology(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo) { int res, nn, found, c1 = 0, c2 = 0, p, i, mypos = -1, posInNode; - char str[FTI_BUFS], *nameList = talloc(char, FTI_Topo.nbNodes *FTI_BUFS); + char str[FTI_BUFS], *nameList = talloc(char, FTI_Topo->nbNodes *FTI_BUFS); - int* nodeList = talloc(int, FTI_Topo.nbNodes* FTI_Topo.nodeSize); - for (i = 0; i < FTI_Topo.nbProc; i++) { + int* nodeList = talloc(int, FTI_Topo->nbNodes* FTI_Topo->nodeSize); + for (i = 0; i < FTI_Topo->nbProc; i++) { nodeList[i] = -1; } - res = FTI_Try(FTI_BuildNodeList(nodeList, nameList), "create node list."); + res = FTI_Try(FTI_BuildNodeList(FTI_Conf, FTI_Exec, FTI_Topo, nodeList, nameList), "create node list."); if (res == FTI_NSCS) { free(nameList); free(nodeList); @@ -346,8 +351,8 @@ int FTI_Topology() return FTI_NSCS; } - if (FTI_Exec.reco > 0) { - res = 
FTI_Try(FTI_ReorderNodes(nodeList, nameList), "reorder nodes."); + if (FTI_Exec->reco > 0) { + res = FTI_Try(FTI_ReorderNodes(FTI_Conf, FTI_Topo, nodeList, nameList), "reorder nodes."); if (res == FTI_NSCS) { free(nameList); free(nodeList); @@ -357,9 +362,9 @@ int FTI_Topology() } // Need to synchronize before editing topology file - MPI_Barrier(FTI_Exec.globalComm); - if (FTI_Topo.myRank == 0 && FTI_Exec.reco == 0) { - res = FTI_Try(FTI_SaveTopo(nameList), "save topology."); + MPI_Barrier(FTI_Exec->globalComm); + if (FTI_Topo->myRank == 0 && FTI_Exec->reco == 0) { + res = FTI_Try(FTI_SaveTopo(FTI_Conf, FTI_Topo, nameList), "save topology."); if (res == FTI_NSCS) { free(nameList); free(nodeList); @@ -368,14 +373,14 @@ int FTI_Topology() } } - int *distProcList = talloc(int, FTI_Topo.nbNodes); - int *userProcList = talloc(int, FTI_Topo.nbProc - (FTI_Topo.nbNodes * FTI_Topo.nbHeads)); + int *distProcList = talloc(int, FTI_Topo->nbNodes); + int *userProcList = talloc(int, FTI_Topo->nbProc - (FTI_Topo->nbNodes * FTI_Topo->nbHeads)); - for (i = 0; i < FTI_Topo.nbProc; i++) { - if (FTI_Topo.myRank == nodeList[i]) { + for (i = 0; i < FTI_Topo->nbProc; i++) { + if (FTI_Topo->myRank == nodeList[i]) { mypos = i; } - if ((i % FTI_Topo.nodeSize != 0) || (FTI_Topo.nbHeads == 0)) { + if ((i % FTI_Topo->nodeSize != 0) || (FTI_Topo->nbHeads == 0)) { userProcList[c2] = nodeList[i]; c2++; } @@ -389,23 +394,23 @@ int FTI_Topology() return FTI_NSCS; } - FTI_Topo.nodeRank = mypos % FTI_Topo.nodeSize; - if (FTI_Topo.nodeRank == 0 && FTI_Topo.nbHeads == 1) { - FTI_Topo.amIaHead = 1; + FTI_Topo->nodeRank = mypos % FTI_Topo->nodeSize; + if (FTI_Topo->nodeRank == 0 && FTI_Topo->nbHeads == 1) { + FTI_Topo->amIaHead = 1; } else { - FTI_Topo.amIaHead = 0; + FTI_Topo->amIaHead = 0; } - FTI_Topo.nodeID = mypos / FTI_Topo.nodeSize; - FTI_Topo.headRank = nodeList[(mypos / FTI_Topo.nodeSize) * FTI_Topo.nodeSize]; - FTI_Topo.sectorID = FTI_Topo.nodeID / FTI_Topo.groupSize; - posInNode = mypos % 
FTI_Topo.nodeSize; - FTI_Topo.groupID = posInNode; - for (i = 0; i < FTI_Topo.nbNodes; i++) { - distProcList[i] = nodeList[(FTI_Topo.nodeSize * i) + posInNode]; + FTI_Topo->nodeID = mypos / FTI_Topo->nodeSize; + FTI_Topo->headRank = nodeList[(mypos / FTI_Topo->nodeSize) * FTI_Topo->nodeSize]; + FTI_Topo->sectorID = FTI_Topo->nodeID / FTI_Topo->groupSize; + posInNode = mypos % FTI_Topo->nodeSize; + FTI_Topo->groupID = posInNode; + for (i = 0; i < FTI_Topo->nbNodes; i++) { + distProcList[i] = nodeList[(FTI_Topo->nodeSize * i) + posInNode]; } - res = FTI_Try(FTI_CreateComms(userProcList, distProcList, nodeList), "create communicators."); + res = FTI_Try(FTI_CreateComms(FTI_Conf, FTI_Exec, FTI_Topo, userProcList, distProcList, nodeList), "create communicators."); if (res == FTI_NSCS) { free(userProcList); free(distProcList); From 8e9bec5faa8e6fe29c78016667123f27abb1c5cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Wed, 1 Jun 2016 13:50:55 +0200 Subject: [PATCH 86/93] Enabling fortran wrapper and examples --- CMakeLists.txt | 4 ++-- examples/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ae08d395..df5bb97b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,8 +4,8 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") project(FTI C Fortran) endif() -option(ENABLE_FORTRAN "Enables the generation of the Fortran wrapper for FTI" OFF) -option(ENABLE_EXAMPLES "Enables the generation of examples" OFF) +option(ENABLE_FORTRAN "Enables the generation of the Fortran wrapper for FTI" ON) +option(ENABLE_EXAMPLES "Enables the generation of examples" ON) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeScripts") include(AppendProperty) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a0e507bda..449f1b9df 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -28,7 +28,7 @@ endif() if(ENABLE_FORTRAN) 
add_executable(hdf.exe fheatdis.f90) - target_link_libraries(hdf.exe fti_f90 fti ${MPI_Fortran_LIBRARIES} m) + target_link_libraries(hdf.exe fti_f90 ${CMAKE_BINARY_DIR}/lib/libfti.a ${MPI_Fortran_LIBRARIES} m) if(MPI_Fortran_COMPILE_FLAGS) set_target_properties(hdf.exe PROPERTIES From 49679076e44a8220458a2988accc502b7f2d6c8d Mon Sep 17 00:00:00 2001 From: Leonardo Bautista Gomez Date: Mon, 6 Jun 2016 19:11:02 +0200 Subject: [PATCH 87/93] Solving bug about rename/erase local files. --- src/checkpoint.c | 2 +- src/tools.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/checkpoint.c b/src/checkpoint.c index 9f9629f5d..5fb865075 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -222,7 +222,7 @@ int FTI_PostCkpt(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, FTI_GroupClean(FTI_Conf, FTI_Topo, FTI_Ckpt, FTI_Exec->ckptLvel, group, pr); MPI_Barrier(FTI_COMM_WORLD); - nodeFlag = (((!FTI_Topo->amIaHead) && (FTI_Topo->nodeRank == 0)) || (FTI_Topo->amIaHead)) ? 1 : 0; + nodeFlag = (((!FTI_Topo->amIaHead) && ((FTI_Topo->nodeRank - FTI_Topo->nbHeads) == 0)) || (FTI_Topo->amIaHead)) ? 1 : 0; if (nodeFlag) { level = (FTI_Exec->ckptLvel != 4) ? FTI_Exec->ckptLvel : 1; if (rename(FTI_Conf->lTmpDir, FTI_Ckpt[level].dir) == -1) diff --git a/src/tools.c b/src/tools.c index ab58e9fda..313a404a2 100644 --- a/src/tools.c +++ b/src/tools.c @@ -138,7 +138,7 @@ int FTI_Clean(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, char buf[FTI_BUFS]; int nodeFlag, globalFlag = !FTI_Topo->splitRank; - nodeFlag = (((!FTI_Topo->amIaHead) && (FTI_Topo->nodeRank == 0)) || (FTI_Topo->amIaHead)) ? 1 : 0; + nodeFlag = (((!FTI_Topo->amIaHead) && ((FTI_Topo->nodeRank - FTI_Topo->nbHeads) == 0)) || (FTI_Topo->amIaHead)) ? 
1 : 0; if (level == 0) { FTI_RmDir(FTI_Conf->mTmpDir, globalFlag); From cf96029659adb848603bd072ab184c16728554da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awek=20Zdanowski?= Date: Tue, 7 Jun 2016 12:34:54 +0200 Subject: [PATCH 88/93] Fixing incorrect warnings for no-head ranks --- src/api.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/api.c b/src/api.c index 1fe6af92d..d7bd3b531 100644 --- a/src/api.c +++ b/src/api.c @@ -312,8 +312,8 @@ int FTI_BitFlip(int datasetID) /*-------------------------------------------------------------------------*/ int FTI_Checkpoint(int id, int level) { - int i, res = FTI_NSCS; - double t0, t1, t2, t3, t4; + int res = FTI_NSCS, value; + double t0, t1, t2, t3; char str[FTI_BUFS]; MPI_Status status; if ((level > 0) && (level < 5)) { @@ -337,12 +337,12 @@ int FTI_Checkpoint(int id, int level) if (!FTI_Ckpt[FTI_Exec.ckptLvel].isInline) { // If postCkpt. work is Async. then send message.. FTI_Exec.wasLastOffline = 1; if (res != FTI_SCES) { - res = FTI_REJW; + value = FTI_REJW; } else { - res = FTI_BASE + FTI_Exec.ckptLvel; + value = FTI_BASE + FTI_Exec.ckptLvel; } - MPI_Send(&res, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm); + MPI_Send(&value, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.tag, FTI_Exec.globalComm); } else { FTI_Exec.wasLastOffline = 0; From 987bc9f03431f140b3f86c909b31a88d2da034f3 Mon Sep 17 00:00:00 2001 From: Leonardo Bautista Gomez Date: Tue, 7 Jun 2016 15:34:27 +0200 Subject: [PATCH 89/93] Removing unnecessary things from the example. 
--- examples/heatdis.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/heatdis.c b/examples/heatdis.c index 6523e5bf4..abbfd030e 100644 --- a/examples/heatdis.c +++ b/examples/heatdis.c @@ -13,7 +13,7 @@ #define PRECISION 0.005 -#define ITER_TIMES 2000 +#define ITER_TIMES 5000 #define ITER_OUT 500 #define WORKTAG 50 #define REDUCE 5 @@ -124,8 +124,6 @@ int main(int argc, char *argv[]) for(i = 0; i < ITER_TIMES; i++) { int checkpointed = FTI_Snapshot(); - if (checkpointed == 1) printf("A checkpoint was performed. \n"); - FTI_BitFlip(2); localerror = doWork(nbProcs, rank, M, nbLines, g, h); if (((i%ITER_OUT) == 0) && (rank == 0)) printf("Step : %d, error = %f\n", i, globalerror); if ((i%REDUCE) == 0) MPI_Allreduce(&localerror, &globalerror, 1, MPI_DOUBLE, MPI_MAX, FTI_COMM_WORLD); From cb9d8fa00e9250fa63ac3ed5267c9b0b72c00db8 Mon Sep 17 00:00:00 2001 From: Leonardo Bautista Gomez Date: Tue, 7 Jun 2016 16:18:19 +0200 Subject: [PATCH 90/93] Test with less memory to run faster. --- examples/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/Makefile b/examples/Makefile index 638de22b5..319619351 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -2,7 +2,7 @@ ## Makefile to test FTI library with the heat distribution program. ## PLEASE SET THIS VARIABLE BEFORE COMPILING -FTIPATH = /path/to/fti/install/directory +FTIPATH = /path/to/fti/ ## COMPILERS MPICC ?= mpicc @@ -30,7 +30,7 @@ hdf: fheatdis.f90 $(MPIFC) -o hdf.exe fheatdis.f90 $(FFTIFLAG) hdt: - $(MPIRUN) -np 8 ./hd.exe 32 config.fti + $(MPIRUN) -np 8 ./hd.exe 4 config.fti hdv: ./plot.sh From 82aa10276a4554493f586fcec629f3fffe166166 Mon Sep 17 00:00:00 2001 From: Leonardo Bautista Gomez Date: Tue, 7 Jun 2016 17:00:23 +0200 Subject: [PATCH 91/93] Fixing bad error warning. 
--- src/api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api.c b/src/api.c index 1fe6af92d..14017b099 100644 --- a/src/api.c +++ b/src/api.c @@ -358,7 +358,7 @@ int FTI_Checkpoint(int id, int level) sprintf(str, "%s taken in %.2f sec.", str, t3 - t0); sprintf(str, "%s (Wt:%.2fs, Wr:%.2fs, Ps:%.2fs)", str, t1 - t0, t2 - t1, t3 - t2); FTI_Print(str, FTI_INFO); - if (res == FTI_SCES) + if (res != FTI_NSCS) res = FTI_DONE; else res = FTI_NSCS; From 78cf0d40a7861895b0c8aa55653e5746f5a0b12e Mon Sep 17 00:00:00 2001 From: Gaalich Mohamed Date: Wed, 8 Jun 2016 12:11:32 +0200 Subject: [PATCH 92/93] some correction in the Mpi_isend and mpi_irecv --- examples/fheatdis.f90 | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/fheatdis.f90 b/examples/fheatdis.f90 index a0975e000..14845040b 100644 --- a/examples/fheatdis.f90 +++ b/examples/fheatdis.f90 @@ -7,12 +7,12 @@ program heat use FTI use MPI - real(8), parameter :: PREC = 0.001 - integer, parameter :: ITER_TIMES = 1000 + real(8), parameter :: PREC = 0.005 + integer, parameter :: ITER_TIMES = 2000 integer, parameter :: ITER_OUT = 100 - integer, parameter :: WORKTAG = 26 - integer, parameter :: REDUCE = 8 - integer, parameter :: MEM_MB = 64 + integer, parameter :: WORKTAG = 50 + integer, parameter :: REDUCE = 5 + integer, parameter :: MEM_MB = 32 integer, target :: rank, nbProcs, iter, row, col, err, FTI_comm_world integer, pointer :: ptriter @@ -39,7 +39,7 @@ program heat if ( rank == 0 ) then print '("Local data size is ",I5," x ",I5," = ",F5.0," MB (",I5,").")', & row, col, memSize, MEM_MB - print '("Target precision : ",F9.0)', PREC + print '("Target precision : ",F9.5)', PREC endif ptriter => iter @@ -56,7 +56,7 @@ program heat call doWork(nbProcs, rank, g, h, localerror) if ( ( mod(iter, ITER_OUT) == 0 ) .and. 
(rank == 0) ) then - print '("Step : ",I5,", error = ",F9.0)', iter, globalerror + print '("Step : ",I5,", error = ",F9.5)', iter, globalerror endif if ( mod(iter, REDUCE) == 0 ) then call MPI_Allreduce(localerror, globalerror, 1, MPI_REAL8, MPI_MAX, FTI_comm_world, err) @@ -114,8 +114,10 @@ subroutine doWork(numprocs, rank, g, h, localerror) call MPI_Irecv(h(1,1), size(h, 1), MPI_REAL8, rank-1, WORKTAG, FTI_comm_world, req1(2), err) endif if ( rank < numprocs-1 ) then - call MPI_Isend(g(ubound(g, 1)-1, 1), size(g, 1), MPI_REAL8, rank+1, WORKTAG, FTI_comm_world, req2(1), err) - call MPI_Irecv(h(ubound(h, 1) , 1), size(h, 1), MPI_REAL8, rank+1, WORKTAG, FTI_comm_world, req2(2), err) + call MPI_Isend(g(1,ubound(g, 2)-1), size(g, 1), MPI_REAL8, rank+1, WORKTAG, FTI_comm_world, req2(1), err) + call MPI_Irecv(h(1,ubound(h, 2)), size(h, 1), MPI_REAL8, rank+1, WORKTAG, FTI_comm_world, req2(2), err) + !call MPI_Isend(g(ubound(g, 1)-1, 1), size(g, 1), MPI_REAL8, rank+1, WORKTAG, FTI_comm_world, req2(1), err) + !call MPI_Irecv(h(ubound(h, 1) , 1), size(h, 1), MPI_REAL8, rank+1, WORKTAG, FTI_comm_world, req2(2), err) endif if ( rank > 0 ) then call MPI_Waitall(2, req1, status1, err) From 3dbc3d51391339344af273f27485551c5c8992b8 Mon Sep 17 00:00:00 2001 From: Leonardo Bautista Gomez Date: Thu, 9 Jun 2016 17:06:53 +0200 Subject: [PATCH 93/93] Adding Slawek to the author list. --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index f4680f58a..960ca2792 100644 --- a/AUTHORS +++ b/AUTHORS @@ -5,6 +5,7 @@ Author: Collaborators: Julien BIGOT + Slawomir Zdanowski Adele VILLIERMET Sheng DI Faysal BOUI