From 151fce05057efd8d5a442ffed93a08b3566fbce9 Mon Sep 17 00:00:00 2001 From: Kai Keller Date: Fri, 21 Sep 2018 13:44:46 +0200 Subject: [PATCH 1/3] adjusted travis unit test ckpt_tag -> general_tag --- test/ckptHierarchy/ckptHierarchy.c | 4 ++-- test/cornerCases/consistency.c | 6 +++--- test/diffSizes.c | 4 ++-- test/nodeFlag.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/test/ckptHierarchy/ckptHierarchy.c b/test/ckptHierarchy/ckptHierarchy.c index 5e228b4af..80c1414dd 100644 --- a/test/ckptHierarchy/ckptHierarchy.c +++ b/test/ckptHierarchy/ckptHierarchy.c @@ -24,7 +24,7 @@ void simulateCrash() { dictionary* ini = iniparser_load("config.fti"); int heads = (int)iniparser_getint(ini, "Basic:head", -1); int nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); - int ckpt_tag = (int)iniparser_getint(ini, "Advanced:ckpt_tag", 711); + int general_tag = (int)iniparser_getint(ini, "Advanced:general_tag", 2612); int final_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107); int res; if (checkpoint_level[3] != 1) { @@ -43,7 +43,7 @@ void simulateCrash() { } if (isInline == 0) { //waiting untill head do Post-checkpointing - MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize) , ckpt_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize) , general_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } iniparser_freedict(ini); diff --git a/test/cornerCases/consistency.c b/test/cornerCases/consistency.c index 9202659f4..48bdd373a 100644 --- a/test/cornerCases/consistency.c +++ b/test/cornerCases/consistency.c @@ -26,7 +26,7 @@ void simulateCrash() { dictionary* ini = iniparser_load("config.fti"); int heads = (int)iniparser_getint(ini, "Basic:head", -1); int nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); - int ckpt_tag = (int)iniparser_getint(ini, "Advanced:ckpt_tag", 711); + int general_tag = (int)iniparser_getint(ini, "Advanced:general_tag", 2612); int final_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107); int res; if (checkpoint_level != 1) { @@ -46,7 +46,7 @@ void simulateCrash() { if (isInline == 0) { //waiting untill head do Post-checkpointing printf("%d: Receiving.\n", world_rank); - MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize) , ckpt_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize) , general_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); printf("%d: Received.\n", world_rank); } } @@ -74,7 +74,7 @@ void simulateCrashWithoutCkpt() { dictionary* ini = iniparser_load("config.fti"); int heads = (int)iniparser_getint(ini, "Basic:head", -1); int nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); - int ckpt_tag = (int)iniparser_getint(ini, "Advanced:ckpt_tag", 711); + int general_tag = (int)iniparser_getint(ini, "Advanced:general_tag", 2612); int final_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107); int res; iniparser_freedict(ini); diff --git a/test/diffSizes.c b/test/diffSizes.c index 9fc91ad19..8537fe144 100644 --- a/test/diffSizes.c +++ b/test/diffSizes.c @@ -328,7 +328,7 @@ int main(int argc, char** argv) int heads = (int)iniparser_getint(ini, "Basic:head", -1); int nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); int final_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107); - int ckpt_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 711); + int general_tag = (int)iniparser_getint(ini, "Advanced:general_tag", 2612); int res; if (checkpoint_level != 1) { int isInline = -1; @@ -346,7 +346,7 @@ int main(int argc, char** argv) } if (isInline == 0) { //waiting untill head do Post-checkpointing - MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize) , ckpt_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize) , general_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } iniparser_freedict(ini); diff --git a/test/nodeFlag.c b/test/nodeFlag.c index d1adb6c37..f2c0560e5 100644 --- a/test/nodeFlag.c +++ b/test/nodeFlag.c @@ -147,11 +147,11 @@ int main(int argc, char** argv) int heads = (int)iniparser_getint(ini, "Basic:head", -1); int isInlineL4 = (int)iniparser_getint(ini, "Basic:inline_l4", 1); int final_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107); - int ckpt_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 711); + int general_tag = (int)iniparser_getint(ini, "Advanced:general_tag", 2612); if (heads > 0 && !isInlineL4) { //waiting untill head do Post-checkpointing int res; - MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize) , ckpt_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize) , general_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); res = FTI_ENDW; //sending end of work MPI_Send(&res, 1, MPI_INT, global_world_rank - (global_world_rank%nodeSize), final_tag, MPI_COMM_WORLD); From c039c1316364812433233c3d1a6acc107ef0227f Mon Sep 17 00:00:00 2001 From: kellekai Date: Sun, 23 Sep 2018 12:07:35 +0200 Subject: [PATCH 2/3] Update hdf5Test.c ckpt_tag to general_tag --- test/hdf5Test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/hdf5Test.c b/test/hdf5Test.c index b8a2f9e5e..307fb3acf 100644 --- a/test/hdf5Test.c +++ b/test/hdf5Test.c @@ -476,7 +476,7 @@ int main(int argc, char** argv) { int heads = (int)iniparser_getint(ini, "Basic:head", -1); int nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); int final_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107); - int ckpt_tag = (int)iniparser_getint(ini, "Advanced:final_tag", 711); + int general_tag = (int)iniparser_getint(ini, "Advanced:general_tag", 2612); int res; if (checkpoint_level != 1) { int isInline = -1; @@ -494,7 +494,7 @@ int main(int argc, char** argv) { } if (isInline == 0) { //waiting untill head do Post-checkpointing - MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank % nodeSize) , ckpt_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&res, 1, MPI_INT, global_world_rank - (global_world_rank % nodeSize) , general_tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } iniparser_freedict(ini); From 9b322e0439f7e627acbb6e76fd38ca7c7990d38e Mon Sep 17 00:00:00 2001 From: Kai Keller Date: Wed, 26 Sep 2018 16:53:07 +0200 Subject: [PATCH 3/3] corrected merge --- src/api.c | 11 +- src/checkpoint.c | 16 +-- src/conf.c | 298 ++++++++++++++++++++++----------------------- src/ftiff.c | 8 +- src/ftiff.h | 12 +- src/interface.h | 24 ++-- src/stage.c | 3 +- src/tools.c | 7 +- test/local/check.c | 29 ++++- 9 files changed, 216 insertions(+), 192 deletions(-) diff --git a/src/api.c b/src/api.c index 1467fd282..90cf01325 100644 --- a/src/api.c +++ b/src/api.c @@ -995,6 +995,7 @@ int FTI_BitFlip(int datasetID) /*-------------------------------------------------------------------------*/ int FTI_Checkpoint(int id, int level) { + char str[FTI_BUFS]; //For console output if (FTI_Exec.initSCES == 0) { @@ -1031,7 +1032,7 @@ int FTI_Checkpoint(int id, int level) double t0 = MPI_Wtime(); //Start time if (FTI_Exec.wasLastOffline == 1) { // Block until previous checkpoint is done (Async. work) int lastLevel; - MPI_Recv(&lastLevel, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.ckptTag, FTI_Exec.globalComm, MPI_STATUS_IGNORE); + MPI_Recv(&lastLevel, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.generalTag, FTI_Exec.globalComm, MPI_STATUS_IGNORE); if (lastLevel != FTI_NSCS) { //Head sends level of checkpoint if post-processing succeed, FTI_NSCS Otherwise FTI_Exec.lastCkptLvel = lastLevel; //Store last successful post-processing checkpoint level sprintf(str, "LastCkptLvel received from head: %d", lastLevel); @@ -1092,9 +1093,9 @@ int FTI_Checkpoint(int id, int level) } else { strncpy(headInfo->ckptFile, FTI_Exec.meta[0].ckptFile, FTI_BUFS); } - MPI_Send(headInfo, 1, FTIFF_MpiTypes[FTIFF_HEAD_INFO], FTI_Topo.headRank, FTI_Conf.ckptTag, FTI_Exec.globalComm); - MPI_Send(FTI_Exec.meta[0].varID, headInfo->nbVar, MPI_INT, FTI_Topo.headRank, FTI_Conf.ckptTag, FTI_Exec.globalComm); - MPI_Send(FTI_Exec.meta[0].varSize, headInfo->nbVar, MPI_LONG, FTI_Topo.headRank, FTI_Conf.ckptTag, FTI_Exec.globalComm); + MPI_Send(headInfo, 1, FTIFF_MpiTypes[FTIFF_HEAD_INFO], FTI_Topo.headRank, FTI_Conf.generalTag, FTI_Exec.globalComm); + MPI_Send(FTI_Exec.meta[0].varID, headInfo->nbVar, MPI_INT, FTI_Topo.headRank, FTI_Conf.generalTag, FTI_Exec.globalComm); + MPI_Send(FTI_Exec.meta[0].varSize, headInfo->nbVar, MPI_LONG, FTI_Topo.headRank, FTI_Conf.generalTag, FTI_Exec.globalComm); free(headInfo); } @@ -1375,7 +1376,7 @@ int FTI_Finalize() // If there is remaining work to do for last checkpoint if (FTI_Exec.wasLastOffline == 1) { int lastLevel; - MPI_Recv(&lastLevel, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.ckptTag, FTI_Exec.globalComm, MPI_STATUS_IGNORE); + MPI_Recv(&lastLevel, 1, MPI_INT, FTI_Topo.headRank, FTI_Conf.generalTag, FTI_Exec.globalComm, MPI_STATUS_IGNORE); if (lastLevel != FTI_NSCS) { //Head sends level of checkpoint if post-processing succeed, FTI_NSCS Otherwise FTI_Exec.lastCkptLvel = lastLevel; } diff --git a/src/checkpoint.c b/src/checkpoint.c index 868627e0b..a24a44a65 100644 --- a/src/checkpoint.c +++ b/src/checkpoint.c @@ -207,6 +207,7 @@ int FTI_WriteCkpt(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, } switch (FTI_Conf->ioMode) { case FTI_IO_FTIFF: + res = FTI_Try(FTIFF_WriteFTIFF(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, FTI_Data), "write checkpoint using FTI-FF."); break; #ifdef ENABLE_HDF5 //If HDF5 is installed @@ -390,17 +391,16 @@ int FTI_Listen(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, FTI_Print("Head waits for message...", FTI_DBUG); + MPI_Iprobe( MPI_ANY_SOURCE, FTI_Conf->finalTag, FTI_Exec->globalComm, &finalize_flag, &finalize_status ); if ( FTI_Conf->stagingEnabled ) { MPI_Iprobe( MPI_ANY_SOURCE, FTI_Conf->stageTag, FTI_Exec->nodeComm, &stage_flag, &stage_status ); } MPI_Iprobe( MPI_ANY_SOURCE, FTI_Conf->ckptTag, FTI_Exec->globalComm, &ckpt_flag, &ckpt_status ); - MPI_Iprobe( MPI_ANY_SOURCE, FTI_Conf->finalTag, FTI_Exec->globalComm, &finalize_flag, &finalize_status ); - if( ckpt_flag ) { // head will process the whole checkpoint // (treated second due to priority) - FTI_HandleCkptRequest( FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt ); + FTI_HandleCkptRequest( FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt ); ckpt_flag = 0; continue; @@ -467,7 +467,7 @@ int FTI_Listen(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, /*-------------------------------------------------------------------------*/ int FTI_HandleCkptRequest(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt) -{ +{ char str[FTI_BUFS]; //For console output int flags[7]; //Increment index if get corresponding value from application process //(index (1 - 4): checkpoint level; index 5: stops head; index 6: reject checkpoint) @@ -503,15 +503,15 @@ int FTI_HandleCkptRequest(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec int k; for (i = 0; i < FTI_Topo->nbApprocs; i++) { // Iterate on the application processes in the node k = i+1; - MPI_Recv(&(headInfo[i]), 1, FTIFF_MpiTypes[FTIFF_HEAD_INFO], FTI_Topo->body[i], FTI_Conf->ckptTag, FTI_Exec->globalComm, MPI_STATUS_IGNORE); + MPI_Recv(&(headInfo[i]), 1, FTIFF_MpiTypes[FTIFF_HEAD_INFO], FTI_Topo->body[i], FTI_Conf->generalTag, FTI_Exec->globalComm, MPI_STATUS_IGNORE); FTI_Exec->meta[0].exists[k] = headInfo[i].exists; FTI_Exec->meta[0].nbVar[k] = headInfo[i].nbVar; FTI_Exec->meta[0].maxFs[k] = headInfo[i].maxFs; FTI_Exec->meta[0].fs[k] = headInfo[i].fs; FTI_Exec->meta[0].pfs[k] = headInfo[i].pfs; isDcpCnt += headInfo[i].isDcp; - MPI_Recv(&(FTI_Exec->meta[0].varID[k * FTI_BUFS]), headInfo[i].nbVar, MPI_INT, FTI_Topo->body[i], FTI_Conf->ckptTag, FTI_Exec->globalComm, MPI_STATUS_IGNORE); - MPI_Recv(&(FTI_Exec->meta[0].varSize[k * FTI_BUFS]), headInfo[i].nbVar, MPI_LONG, FTI_Topo->body[i], FTI_Conf->ckptTag, FTI_Exec->globalComm, MPI_STATUS_IGNORE); + MPI_Recv(&(FTI_Exec->meta[0].varID[k * FTI_BUFS]), headInfo[i].nbVar, MPI_INT, FTI_Topo->body[i], FTI_Conf->generalTag, FTI_Exec->globalComm, MPI_STATUS_IGNORE); + MPI_Recv(&(FTI_Exec->meta[0].varSize[k * FTI_BUFS]), headInfo[i].nbVar, MPI_LONG, FTI_Topo->body[i], FTI_Conf->generalTag, FTI_Exec->globalComm, MPI_STATUS_IGNORE); strncpy(&(FTI_Exec->meta[0].ckptFile[k * FTI_BUFS]), headInfo[i].ckptFile , FTI_BUFS); sscanf(&(FTI_Exec->meta[0].ckptFile[k * FTI_BUFS]), "Ckpt%d", &FTI_Exec->ckptID); } @@ -551,7 +551,7 @@ int FTI_HandleCkptRequest(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec res = FTI_NSCS; } for (i = 0; i < FTI_Topo->nbApprocs; i++) { // Send msg. to avoid checkpoint collision - MPI_Send(&res, 1, MPI_INT, FTI_Topo->body[i], FTI_Conf->ckptTag, FTI_Exec->globalComm); + MPI_Send(&res, 1, MPI_INT, FTI_Topo->body[i], FTI_Conf->generalTag, FTI_Exec->globalComm); } return FTI_SCES; } diff --git a/src/conf.c b/src/conf.c index 4c1a79d35..827121763 100644 --- a/src/conf.c +++ b/src/conf.c @@ -400,178 +400,178 @@ int FTI_TestConfig(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo, } return FTI_SCES; - } +} - /*-------------------------------------------------------------------------*/ - /** - @brief It tests that the directories given is correct. - @param FTI_Conf Configuration metadata. - @param FTI_Topo Topology metadata. - @return integer FTI_SCES if successful. +/*-------------------------------------------------------------------------*/ +/** + @brief It tests that the directories given is correct. + @param FTI_Conf Configuration metadata. + @param FTI_Topo Topology metadata. + @return integer FTI_SCES if successful. - This function tests that the directories given in the FTI configuration - are correct. + This function tests that the directories given in the FTI configuration + are correct. - **/ - /*-------------------------------------------------------------------------*/ - int FTI_TestDirectories(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo) - { - char str[FTI_BUFS]; //For console output + **/ +/*-------------------------------------------------------------------------*/ +int FTI_TestDirectories(FTIT_configuration* FTI_Conf, FTIT_topology* FTI_Topo) +{ + char str[FTI_BUFS]; //For console output - // Checking local directory - snprintf(str, FTI_BUFS, "Checking the local directory (%s)...", FTI_Conf->localDir); + // Checking local directory + snprintf(str, FTI_BUFS, "Checking the local directory (%s)...", FTI_Conf->localDir); + FTI_Print(str, FTI_DBUG); + if (mkdir(FTI_Conf->localDir, 0777) == -1) { + if (errno != EEXIST) { + FTI_Print("The local directory could NOT be created.", FTI_WARN); + return FTI_NSCS; + } + } + + if (FTI_Topo->myRank == 0) { + // Checking metadata directory + snprintf(str, FTI_BUFS, "Checking the metadata directory (%s)...", FTI_Conf->metadDir); FTI_Print(str, FTI_DBUG); - if (mkdir(FTI_Conf->localDir, 0777) == -1) { + if (mkdir(FTI_Conf->metadDir, 0777) == -1) { if (errno != EEXIST) { - FTI_Print("The local directory could NOT be created.", FTI_WARN); + FTI_Print("The metadata directory could NOT be created.", FTI_WARN); return FTI_NSCS; } } - if (FTI_Topo->myRank == 0) { - // Checking metadata directory - snprintf(str, FTI_BUFS, "Checking the metadata directory (%s)...", FTI_Conf->metadDir); - FTI_Print(str, FTI_DBUG); - if (mkdir(FTI_Conf->metadDir, 0777) == -1) { - if (errno != EEXIST) { - FTI_Print("The metadata directory could NOT be created.", FTI_WARN); - return FTI_NSCS; - } - } - - // Checking global directory - snprintf(str,FTI_BUFS, "Checking the global directory (%s)...", FTI_Conf->glbalDir); - FTI_Print(str, FTI_DBUG); - if (mkdir(FTI_Conf->glbalDir, 0777) == -1) { - if (errno != EEXIST) { - FTI_Print("The global directory could NOT be created.", FTI_WARN); - return FTI_NSCS; - } + // Checking global directory + snprintf(str,FTI_BUFS, "Checking the global directory (%s)...", FTI_Conf->glbalDir); + FTI_Print(str, FTI_DBUG); + if (mkdir(FTI_Conf->glbalDir, 0777) == -1) { + if (errno != EEXIST) { + FTI_Print("The global directory could NOT be created.", FTI_WARN); + return FTI_NSCS; } } - //Waiting for metadDir being created - MPI_Barrier(FTI_COMM_WORLD); - - return FTI_SCES; } + //Waiting for metadDir being created + MPI_Barrier(FTI_COMM_WORLD); - /*-------------------------------------------------------------------------*/ - /** - @brief It creates the directories required for current execution. - @param FTI_Conf Configuration metadata. - @param FTI_Exec Execution metadata. - @param FTI_Topo Topology metadata. - @param FTI_Ckpt Checkpoint metadata. - @return integer FTI_SCES if successful. - - This function creates the temporary metadata, local and global - directories required for the current execution. - - **/ - /*-------------------------------------------------------------------------*/ - int FTI_CreateDirs(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, - FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt) - { - char strerr[FTI_BUFS]; - char fn[FTI_BUFS]; //Path of metadata directory - - // Create metadata timestamp directory - snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf->metadDir, FTI_Exec->id); - if (mkdir(fn, 0777) == -1) { - if (errno != EEXIST) { - FTI_Print("Cannot create metadata timestamp directory", FTI_EROR); - } + return FTI_SCES; +} + +/*-------------------------------------------------------------------------*/ +/** + @brief It creates the directories required for current execution. + @param FTI_Conf Configuration metadata. + @param FTI_Exec Execution metadata. + @param FTI_Topo Topology metadata. + @param FTI_Ckpt Checkpoint metadata. + @return integer FTI_SCES if successful. + + This function creates the temporary metadata, local and global + directories required for the current execution. + + **/ +/*-------------------------------------------------------------------------*/ +int FTI_CreateDirs(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt) +{ + char strerr[FTI_BUFS]; + char fn[FTI_BUFS]; //Path of metadata directory + + // Create metadata timestamp directory + snprintf(fn, FTI_BUFS, "%s/%s", FTI_Conf->metadDir, FTI_Exec->id); + if (mkdir(fn, 0777) == -1) { + if (errno != EEXIST) { + FTI_Print("Cannot create metadata timestamp directory", FTI_EROR); } - snprintf(FTI_Conf->metadDir, FTI_BUFS, "%s", fn); - snprintf(FTI_Conf->mTmpDir, FTI_BUFS, "%s/tmp", fn); - snprintf(FTI_Ckpt[1].metaDir, FTI_BUFS, "%s/l1", fn); - snprintf(FTI_Ckpt[2].metaDir, FTI_BUFS, "%s/l2", fn); - snprintf(FTI_Ckpt[3].metaDir, FTI_BUFS, "%s/l3", fn); - snprintf(FTI_Ckpt[4].metaDir, FTI_BUFS, "%s/l4", fn); - - // Create global checkpoint timestamp directory - snprintf(fn, FTI_BUFS, "%s", FTI_Conf->glbalDir); - snprintf(FTI_Conf->glbalDir, FTI_BUFS, "%s/%s", fn, FTI_Exec->id); - if (mkdir(FTI_Conf->glbalDir, 0777) == -1) { - if (errno != EEXIST) { - FTI_Print("Cannot create global checkpoint timestamp directory", FTI_EROR); - } + } + snprintf(FTI_Conf->metadDir, FTI_BUFS, "%s", fn); + snprintf(FTI_Conf->mTmpDir, FTI_BUFS, "%s/tmp", fn); + snprintf(FTI_Ckpt[1].metaDir, FTI_BUFS, "%s/l1", fn); + snprintf(FTI_Ckpt[2].metaDir, FTI_BUFS, "%s/l2", fn); + snprintf(FTI_Ckpt[3].metaDir, FTI_BUFS, "%s/l3", fn); + snprintf(FTI_Ckpt[4].metaDir, FTI_BUFS, "%s/l4", fn); + + // Create global checkpoint timestamp directory + snprintf(fn, FTI_BUFS, "%s", FTI_Conf->glbalDir); + snprintf(FTI_Conf->glbalDir, FTI_BUFS, "%s/%s", fn, FTI_Exec->id); + if (mkdir(FTI_Conf->glbalDir, 0777) == -1) { + if (errno != EEXIST) { + FTI_Print("Cannot create global checkpoint timestamp directory", FTI_EROR); } - snprintf(FTI_Conf->gTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf->glbalDir); - snprintf(FTI_Ckpt[4].dcpDir, FTI_BUFS, "%s/dCP", FTI_Conf->glbalDir); - snprintf(FTI_Ckpt[4].dcpName, FTI_BUFS, "dCPFile-Rank%d.fti", FTI_Topo->myRank); - snprintf(FTI_Ckpt[4].dir, FTI_BUFS, "%s/l4", FTI_Conf->glbalDir); - snprintf(FTI_Ckpt[4].archDir, FTI_BUFS, "%s/l4_archive", FTI_Conf->glbalDir); - if ( FTI_Conf->keepL4Ckpt ) { - if (mkdir(FTI_Ckpt[4].archDir, (mode_t) 0777) == -1) { - if (errno != EEXIST) { - snprintf(strerr, FTI_BUFS, "failed to create directory '%s', cannot keep L4 checkpoint.", FTI_Ckpt[4].archDir); - FTI_Print(strerr, FTI_EROR); - FTI_Conf->keepL4Ckpt = false; - } + } + snprintf(FTI_Conf->gTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf->glbalDir); + snprintf(FTI_Ckpt[4].dcpDir, FTI_BUFS, "%s/dCP", FTI_Conf->glbalDir); + snprintf(FTI_Ckpt[4].dcpName, FTI_BUFS, "dCPFile-Rank%d.fti", FTI_Topo->myRank); + snprintf(FTI_Ckpt[4].dir, FTI_BUFS, "%s/l4", FTI_Conf->glbalDir); + snprintf(FTI_Ckpt[4].archDir, FTI_BUFS, "%s/l4_archive", FTI_Conf->glbalDir); + if ( FTI_Conf->keepL4Ckpt ) { + if (mkdir(FTI_Ckpt[4].archDir, (mode_t) 0777) == -1) { + if (errno != EEXIST) { + snprintf(strerr, FTI_BUFS, "failed to create directory '%s', cannot keep L4 checkpoint.", FTI_Ckpt[4].archDir); + FTI_Print(strerr, FTI_EROR); + FTI_Conf->keepL4Ckpt = false; } } + } - // Create local checkpoint timestamp directory - if (FTI_Conf->test) { // If local test generate name by topology - snprintf(fn, FTI_BUFS, "%s/node%d", FTI_Conf->localDir, FTI_Topo->myRank / FTI_Topo->nodeSize); - if (mkdir(fn, 0777) == -1) { - if (errno != EEXIST) { - FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); - } - } - } - else { - snprintf(fn, FTI_BUFS, "%s", FTI_Conf->localDir); - } - snprintf(FTI_Conf->localDir, FTI_BUFS, "%s/%s", fn, FTI_Exec->id); - if (mkdir(FTI_Conf->localDir, 0777) == -1) { + // Create local checkpoint timestamp directory + if (FTI_Conf->test) { // If local test generate name by topology + snprintf(fn, FTI_BUFS, "%s/node%d", FTI_Conf->localDir, FTI_Topo->myRank / FTI_Topo->nodeSize); + if (mkdir(fn, 0777) == -1) { if (errno != EEXIST) { FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); } } - snprintf(FTI_Conf->lTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf->localDir); - snprintf(FTI_Ckpt[1].dir, FTI_BUFS, "%s/l1", FTI_Conf->localDir); - snprintf(FTI_Ckpt[1].dcpDir, FTI_BUFS, "%s/dCP", FTI_Conf->localDir); - snprintf(FTI_Ckpt[2].dir, FTI_BUFS, "%s/l2", FTI_Conf->localDir); - snprintf(FTI_Ckpt[3].dir, FTI_BUFS, "%s/l3", FTI_Conf->localDir); - return FTI_SCES; } - - /*-------------------------------------------------------------------------*/ - /** - @brief It reads and tests the configuration given. - @param FTI_Conf Configuration metadata. - @param FTI_Exec Execution metadata. - @param FTI_Topo Topology metadata. - @param FTI_Ckpt Checkpoint metadata. - @param FTI_Inje Type to describe failure injections in FTI. - @return integer FTI_SCES if successful. - - This function reads the configuration file. Then test that the - configuration parameters are correct (including directories). - - **/ - /*-------------------------------------------------------------------------*/ - int FTI_LoadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, - FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, - FTIT_injection *FTI_Inje) - { - int res = FTI_Try(FTI_ReadConf(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, FTI_Inje), "read configuration."); - if (res == FTI_NSCS) { - return FTI_NSCS; - } - res = FTI_Try(FTI_TestConfig(FTI_Conf, FTI_Topo, FTI_Ckpt, FTI_Exec), "pass the configuration test."); - if (res == FTI_NSCS) { - return FTI_NSCS; - } - res = FTI_Try(FTI_TestDirectories(FTI_Conf, FTI_Topo), "pass the directories test."); - if (res == FTI_NSCS) { - return FTI_NSCS; - } - res = FTI_Try(FTI_CreateDirs(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt), "create checkpoint directories."); - if (res == FTI_NSCS) { - return FTI_NSCS; + else { + snprintf(fn, FTI_BUFS, "%s", FTI_Conf->localDir); + } + snprintf(FTI_Conf->localDir, FTI_BUFS, "%s/%s", fn, FTI_Exec->id); + if (mkdir(FTI_Conf->localDir, 0777) == -1) { + if (errno != EEXIST) { + FTI_Print("Cannot create local checkpoint timestamp directory", FTI_EROR); } - return FTI_SCES; } + snprintf(FTI_Conf->lTmpDir, FTI_BUFS, "%s/tmp", FTI_Conf->localDir); + snprintf(FTI_Ckpt[1].dir, FTI_BUFS, "%s/l1", FTI_Conf->localDir); + snprintf(FTI_Ckpt[1].dcpDir, FTI_BUFS, "%s/dCP", FTI_Conf->localDir); + snprintf(FTI_Ckpt[2].dir, FTI_BUFS, "%s/l2", FTI_Conf->localDir); + snprintf(FTI_Ckpt[3].dir, FTI_BUFS, "%s/l3", FTI_Conf->localDir); + return FTI_SCES; +} + +/*-------------------------------------------------------------------------*/ +/** + @brief It reads and tests the configuration given. + @param FTI_Conf Configuration metadata. + @param FTI_Exec Execution metadata. + @param FTI_Topo Topology metadata. + @param FTI_Ckpt Checkpoint metadata. + @param FTI_Inje Type to describe failure injections in FTI. + @return integer FTI_SCES if successful. + + This function reads the configuration file. Then test that the + configuration parameters are correct (including directories). + + **/ +/*-------------------------------------------------------------------------*/ +int FTI_LoadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + FTIT_injection *FTI_Inje) +{ + int res = FTI_Try(FTI_ReadConf(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt, FTI_Inje), "read configuration."); + if (res == FTI_NSCS) { + return FTI_NSCS; + } + res = FTI_Try(FTI_TestConfig(FTI_Conf, FTI_Topo, FTI_Ckpt, FTI_Exec), "pass the configuration test."); + if (res == FTI_NSCS) { + return FTI_NSCS; + } + res = FTI_Try(FTI_TestDirectories(FTI_Conf, FTI_Topo), "pass the directories test."); + if (res == FTI_NSCS) { + return FTI_NSCS; + } + res = FTI_Try(FTI_CreateDirs(FTI_Conf, FTI_Exec, FTI_Topo, FTI_Ckpt), "create checkpoint directories."); + if (res == FTI_NSCS) { + return FTI_NSCS; + } + return FTI_SCES; +} diff --git a/src/ftiff.c b/src/ftiff.c index 5af86e5d2..1c9a98c17 100644 --- a/src/ftiff.c +++ b/src/ftiff.c @@ -40,23 +40,23 @@ #include "interface.h" -/** +/* +-------------------------------------------------------------------------+ | STATIC TYPE DECLARATIONS | +-------------------------------------------------------------------------+ - **/ +*/ MPI_Datatype FTIFF_MpiTypes[FTIFF_NUM_MPI_TYPES]; -/** +/* +-------------------------------------------------------------------------+ | FUNCTION DEFINITIONS | +-------------------------------------------------------------------------+ - **/ +*/ /*-------------------------------------------------------------------------*/ /** diff --git a/src/ftiff.h b/src/ftiff.h index 37fcb229b..5fe77d710 100644 --- a/src/ftiff.h +++ b/src/ftiff.h @@ -63,13 +63,13 @@ printf( "%s:%d[DEBUG-%d] " MSG "\n", __FILENAME__,__LINE__,rank, ##__VA_ARGS__); \ } while (0) -/* +/** +-------------------------------------------------------------------------+ | FTI-FF TYPES | +-------------------------------------------------------------------------+ -*/ + **/ /** @typedef dcpBLK_t * @brief unsigned short (0 - 65535). @@ -126,13 +126,13 @@ typedef struct FTIFF_L3Info { long RSfs; // maxFs } FTIFF_L3Info; -/* +/** +-------------------------------------------------------------------------+ | MPI DERIVED DATA TYPES | +-------------------------------------------------------------------------+ -*/ + **/ // ID MPI types enum { @@ -153,13 +153,13 @@ typedef struct FTIFF_MPITypeInfo { MPI_Aint* mbrDisp; } FTIFF_MPITypeInfo; -/* +/** +-------------------------------------------------------------------------+ | FUNCTION DECLARATIONS | +-------------------------------------------------------------------------+ -*/ + **/ void FTIFF_InitMpiTypes(); int FTIFF_DeserializeFileMeta( FTIFF_metaInfo* meta, char* buffer_ser ); diff --git a/src/interface.h b/src/interface.h index f62b73e67..6f1117fa5 100644 --- a/src/interface.h +++ b/src/interface.h @@ -145,13 +145,13 @@ int FTI_LoadConf(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, FTIT_injection *FTI_Inje); #ifdef ENABLE_HDF5 - int FTI_WriteHDF5(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, - FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, - FTIT_dataset* FTI_Data); - int FTI_RecoverHDF5(FTIT_execution* FTI_Exec, FTIT_checkpoint* FTI_Ckpt, - FTIT_dataset* FTI_Data); - int FTI_RecoverVarHDF5(FTIT_execution* FTI_Exec, FTIT_checkpoint* FTI_Ckpt, - FTIT_dataset* FTI_Data, int id); +int FTI_WriteHDF5(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, + FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt, + FTIT_dataset* FTI_Data); +int FTI_RecoverHDF5(FTIT_execution* FTI_Exec, FTIT_checkpoint* FTI_Ckpt, + FTIT_dataset* FTI_Data); +int FTI_RecoverVarHDF5(FTIT_execution* FTI_Exec, FTIT_checkpoint* FTI_Ckpt, + FTIT_dataset* FTI_Data, int id); #endif int FTI_GetChecksums(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec, @@ -226,11 +226,11 @@ void FTI_MallocMeta(FTIT_execution* FTI_Exec, FTIT_topology* FTI_Topo); void FTI_FreeMeta(FTIT_execution* FTI_Exec); void FTI_FreeTypesAndGroups(FTIT_execution* FTI_Exec); #ifdef ENABLE_HDF5 - void FTI_CreateComplexType(FTIT_type* ftiType, FTIT_type** FTI_Type); - void FTI_CloseComplexType(FTIT_type* ftiType, FTIT_type** FTI_Type); - void FTI_CreateGroup(FTIT_H5Group* ftiGroup, hid_t parentGroup, FTIT_H5Group** FTI_Group); - void FTI_OpenGroup(FTIT_H5Group* ftiGroup, hid_t parentGroup, FTIT_H5Group** FTI_Group); - void FTI_CloseGroup(FTIT_H5Group* ftiGroup, FTIT_H5Group** FTI_Group); +void FTI_CreateComplexType(FTIT_type* ftiType, FTIT_type** FTI_Type); +void FTI_CloseComplexType(FTIT_type* ftiType, FTIT_type** FTI_Type); +void FTI_CreateGroup(FTIT_H5Group* ftiGroup, hid_t parentGroup, FTIT_H5Group** FTI_Group); +void FTI_OpenGroup(FTIT_H5Group* ftiGroup, hid_t parentGroup, FTIT_H5Group** FTI_Group); +void FTI_CloseGroup(FTIT_H5Group* ftiGroup, FTIT_H5Group** FTI_Group); #endif int FTI_InitGroupsAndTypes(FTIT_execution* FTI_Exec); int FTI_InitBasicTypes(FTIT_dataset* FTI_Data); diff --git a/src/stage.c b/src/stage.c index 3e57dba24..4c854dcab 100644 --- a/src/stage.c +++ b/src/stage.c @@ -555,7 +555,8 @@ int FTI_FreeStageRequest( FTIT_execution *FTI_Exec, FTIT_topology *FTI_Topo, int -1, else. **/ /*-------------------------------------------------------------------------*/ -int FTI_GetRequestIdx( int ID ) { +int FTI_GetRequestIdx( int ID ) +{ if ( FTI_GetRequestField( ID, FTI_SIF_ALL ) ) { return FTI_GetRequestField( ID, FTI_SIF_IDX ); diff --git a/src/tools.c b/src/tools.c index 5db858b41..eea7520a6 100644 --- a/src/tools.c +++ b/src/tools.c @@ -395,7 +395,8 @@ void FTI_FreeMeta(FTIT_execution* FTI_Exec) **/ /*-------------------------------------------------------------------------*/ -int FTI_InitGroupsAndTypes(FTIT_execution* FTI_Exec) { +int FTI_InitGroupsAndTypes(FTIT_execution* FTI_Exec) +{ FTI_Exec->FTI_Type = malloc(sizeof(FTIT_type*) * FTI_BUFS); if (FTI_Exec->FTI_Type == NULL) { return FTI_NSCS; @@ -427,7 +428,8 @@ int FTI_InitGroupsAndTypes(FTIT_execution* FTI_Exec) { **/ /*-------------------------------------------------------------------------*/ -void FTI_FreeTypesAndGroups(FTIT_execution* FTI_Exec) { +void FTI_FreeTypesAndGroups(FTIT_execution* FTI_Exec) +{ int i; for (i = 0; i < FTI_Exec->nbType; i++) { if (FTI_Exec->FTI_Type[i]->structure != NULL) { @@ -702,7 +704,6 @@ int FTI_InitBasicTypes(FTIT_dataset* FTI_Data) **/ /*-------------------------------------------------------------------------*/ - int FTI_RmDir(char path[FTI_BUFS], int flag) { if (flag) { diff --git a/test/local/check.c b/test/local/check.c index 7e93a358e..1307b221d 100644 --- a/test/local/check.c +++ b/test/local/check.c @@ -40,6 +40,8 @@ #include #include #include +#include "../../deps/iniparser/iniparser.h" +#include "../../deps/iniparser/dictionary.h" #define N 100000 #define CNTRLD_EXIT 10 @@ -145,6 +147,19 @@ int main(int argc, char* argv[]) { MPI_Comm_rank(FTI_COMM_WORLD,&FTI_APP_RANK); + dictionary *ini = iniparser_load( argv[1] ); + int grank; + MPI_Comm_rank(MPI_COMM_WORLD,&grank); + int nbHeads = (int)iniparser_getint(ini, "Basic:head", -1); + int finalTag = (int)iniparser_getint(ini, "Advanced:final_tag", 3107); + int nodeSize = (int)iniparser_getint(ini, "Basic:node_size", -1); + int headRank = grank - grank%nodeSize; + + if ( (nbHeads<0) || (nodeSize<0) ) { + printf("wrong configuration (for head or node-size settings)!\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + asize = N; if (diff_sizes) { @@ -195,11 +210,17 @@ int main(int argc, char* argv[]) { if (state == INIT) { init_arrays(A, B, asize); write_data(B, &asize, FTI_APP_RANK); - MPI_Barrier(FTI_COMM_WORLD); FTI_Checkpoint(1,level); - sleep(5); - if (crash && FTI_APP_RANK == 0) { - exit(CNTRLD_EXIT); + MPI_Barrier(FTI_COMM_WORLD); + + if ( crash ) { + if( nbHeads > 0 ) { + int value = FTI_ENDW; + MPI_Send(&value, 1, MPI_INT, headRank, finalTag, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Finalize(); + exit(0); } }