diff --git a/doc-dev/rst/developers/containers.rst b/doc-dev/rst/developers/containers.rst deleted file mode 100644 index 17e18df7..00000000 --- a/doc-dev/rst/developers/containers.rst +++ /dev/null @@ -1,97 +0,0 @@ -Containers -========== - -NOTE: This feature is experimental and not yet complete, so it it not -documented in the user guide. - -SCR requires checkpoint data to be stored primarily as a file per -process. However, writing a large number of files is inefficient or -difficult to manage on some file systems. To alleviate this problem, SCR -provides an abstraction called “containers”. When writing data to or -reading data from the prefix directory, SCR combines multiple -application files into a container. Containers are disabled by default. -To enable them, set the ``SCR_USE_CONTAINERS`` parameter to 1. - -During a flush, SCR identifies the containers and the offsets within -those containers where each file should be stored. SCR records the -file-to-container mapping in the rank2file map, which it later -references to extract files during the fetch operation. - -A container has a maximum size, which is determined by the -``SCR_CONTAINER_SIZE`` parameter. This parameter defaults to 100GB. -Application file data is packed sequentially within a container until -the container is full, and then the remaining data spills over to the -next container. The total number of containers required depends on the -total number of bytes in the dataset and the container size. A container -file name is of the form ``ctr..scr``, where ```` is the -container id which counts up from 0. All containers are written to the -dataset directory within the prefix directory. - -SCR combines files in an order such that all files on the same node are -grouped sequentially. This limits the number of files that each compute -node must access. For this purpose, SCR creates two global communicators -during ``SCR_Init``. Both are defined in ``scr_globals.c``. 
The -``scr_comm_node`` communicator consists of all processes on the same -compute node. The ``scr_comm_node_across`` communicator consists of all -processes having the same rank within ``scr_comm_node``. Note that some -process has rank 0 in ``scr_comm_node`` for each node in the run. This -process is called the “node leader”. - -To get the offset where each process should write its data, SCR first -sums up the sizes of all files on the node via a reduce on -``scr_comm_node``. The node leaders then execute a scan across nodes -using the ``scr_comm_node_across`` communicator to get a node offset. A -final scan within ``scr_comm_node`` produces the offset at which each -process should write its data. - -TODO: discuss setting in flush descriptor stored in filemap under -dataset id and rank - -TODO: discuss containers during a scavenge - -TODO: should we copy redundancy data to containers as well? - -Within a rank2file map file, the file-to-container map adds entries -under the ``SEG`` key for each file. An example entry looks like the -following: - -:: - - rank_2.ckpt - SEG - 0 - FILE - .scr/ctr.1.scr - OFFSET - 224295 - LENGTH - 75705 - 1 - FILE - .scr/ctr.2.scr - OFFSET - 0 - LENGTH - 300000 - 2 - FILE - .scr/ctr.3.scr - OFFSET - 0 - LENGTH - 148591 - -The ``SEG`` key specifies file data as a list of numbered segments -starting from 0. Each segment specifies the length of file data, and the -name and offset at which it can be found within a container file. -Reading all segments in order produces the full sequence of bytes that -make up the file. The name of the container file is given as a relative -path from the dataset directory. - -In the above example, the container size is set to 300000. This size is -smaller than normal to illustrate the various fields. The data for the -``rank_2.ckpt`` file is split among three segments. The first segment of -75705 bytes is in the container file named ``.scr/ctr.1.scr`` starting -at offset 224295. 
The next segment is 300000 bytes and is in -``.scr/ctr.2.scr`` starting at offset 0. The final segment of 148591 -bytes are in ``.scr/ctr.3.scr`` starting at offset 0. diff --git a/doc-dev/rst/index.rst b/doc-dev/rst/index.rst index 4eda37db..9cffe421 100644 --- a/doc-dev/rst/index.rst +++ b/doc-dev/rst/index.rst @@ -115,7 +115,6 @@ Contents developers/redundancy_descriptors.rst developers/schemes.rst developers/scheme_xor.rst - developers/containers.rst developers/drain.rst developers/logging.rst diff --git a/examples/test_api.c b/examples/test_api.c index 4ed7f588..a415bc00 100644 --- a/examples/test_api.c +++ b/examples/test_api.c @@ -606,7 +606,7 @@ void print_usage() printf(" --noscr Disable SCR calls\n"); printf(" --noscrrestart Disable SCR restart calls\n"); printf(" --shared-file Use single shared file instead of file per rank"); - printf(" --global-store= Specify DIR as a global storage location for cache"); + printf(" --global-cache= Specify DIR as a global storage location for cache"); printf(" -h, --help Print usage\n"); printf("\n"); return; @@ -634,7 +634,7 @@ int main (int argc, char* argv[]) {"noscr", no_argument, NULL, 'x'}, {"noscrrestart", no_argument, NULL, 'X'}, {"shared-file", no_argument, NULL, 'y'}, - {"global-store", required_argument, NULL, 'Y'}, + {"global-cache", required_argument, NULL, 'Y'}, {"help", no_argument, NULL, 'h'}, {NULL, no_argument, NULL, 0} }; @@ -643,7 +643,7 @@ int main (int argc, char* argv[]) int long_index = 0; int opt = getopt_long(argc, argv, opt_string, long_options, &long_index); char* current = NULL; - char* global_store = NULL; + char* global_cache = NULL; unsigned long long val; while (opt != -1) { switch(opt) { @@ -691,7 +691,7 @@ int main (int argc, char* argv[]) use_shared_file = 1; break; case 'Y': - global_store = strdup(optarg); + global_cache = strdup(optarg); break; case 'h': default: @@ -730,8 +730,10 @@ int main (int argc, char* argv[]) /* For a global cache, one must define a STORE descriptor * and 
declare the path to have WORLD access, e.g., * SCR_Config("STORE=/lustre/$USER/scrcache GROUP=WORLD"); */ - if (global_store != NULL) { - SCR_Configf("STORE=%s GROUP=WORLD", global_store); + if (global_cache != NULL) { + SCR_Configf("STORE=%s GROUP=WORLD", global_cache); + SCR_Configf("SCR_CACHE_BASE=%s", global_cache); + SCR_Config("SCR_CACHE_BYPASS=0"); } if (SCR_Init() != SCR_SUCCESS){ @@ -830,9 +832,9 @@ int main (int argc, char* argv[]) } } - if (global_store != NULL) { - free(global_store); - global_store = NULL; + if (global_cache != NULL) { + free(global_cache); + global_cache = NULL; } if (current != NULL) { diff --git a/src/scr.c b/src/scr.c index d04c2de8..a0019c72 100644 --- a/src/scr.c +++ b/src/scr.c @@ -1593,9 +1593,11 @@ static int scr_start_output(const char* name, int flags) return SCR_SUCCESS; } -/* detect files that have been registered by more than one process, - * drop filemap entries from all but one process */ -static int scr_assign_ownership(scr_filemap* map, const scr_reddesc* rd) +/* rank relative ownership of the filemap since multiple processes may + * have been writing to it. The rankings are used later for portions of + * the SCR implementation that require access to be made in the context of a + * single process. 
*/ +static int scr_rank_ownership(scr_filemap* map, const scr_reddesc* rd) { int rc = SCR_SUCCESS; @@ -1667,7 +1669,13 @@ static int scr_assign_ownership(scr_filemap* map, const scr_reddesc* rd) /* keep rank 0 for each file as its owner, remove any entry from the filemap * for which we are not rank 0 */ int multiple_owner = 0; + for (i = 0; i < count; i++) { + scr_meta* meta = scr_meta_new(); + + scr_filemap_get_meta(map, mapfiles[i], meta); + scr_meta_set_group_rank(meta, group_rank[i]); + /* check whether this file exists on multiple ranks */ if (group_ranks[i] > 1) { /* found the same file on more than one rank */ @@ -1681,11 +1689,8 @@ static int scr_assign_ownership(scr_filemap* map, const scr_reddesc* rd) } } - /* only keep entry for this file in filemap if we're the - * first rank in the set of ranks that have this file */ - if (group_rank[i] != 0) { - scr_filemap_remove_file(map, mapfiles[i]); - } + scr_filemap_set_meta(map, mapfiles[i], meta); + scr_meta_delete(&meta); } /* fatal error if any file is on more than one rank @@ -1737,13 +1742,16 @@ static int scr_complete_output(int valid) time_start = MPI_Wtime(); } - /* When using bypass mode or shared cache, we allow different procs to write to the same file, - * in which case, both should have registered the file in Route_file and thus - * have an entry in the file map. The proper thing to do here is to list the - * set of ranks that share a file, however, that requires fixing up lots of - * other parts of the code. For now, ensure that at most one rank lists the - * file in their file map. */ - rc = scr_assign_ownership(scr_map, scr_rd); + /* When using bypass mode or shared cache, we allow different procs to write + * to the same file, in which case, both should have registered the file in + * Route_file and thus have an entry in the file map. 
+ * + * Calling scr_rank_ownership allows us to distinguish between the files that + * were created by a single process versus the ones that are shared between + * many. Further, this function also designates a single process (rank) that + * may be used in the cases where there needs to be only one process + * doing work on the file. */ + rc = scr_rank_ownership(scr_map, scr_rd); /* count number of files, number of bytes, and record filesize for each file * as written by this process */ @@ -1756,6 +1764,15 @@ static int scr_complete_output(int valid) { /* get the filename */ char* file = kvtree_elem_key(elem); + + /* + * For now, we continue to process files as if they are only written by + * a single process. We will open this up soon once we have updated + * AXL to take advantage of it. */ + if ( ! scr_filemap_leader_rank(scr_map, file) ) { + continue; + } + my_counts[0]++; /* start with valid flag from caller for this file */ @@ -1787,6 +1804,7 @@ static int scr_complete_output(int valid) /* fill in filesize and complete flag in the meta data for the file */ scr_meta* meta = scr_meta_new(); scr_filemap_get_meta(scr_map, file, meta); + scr_meta_set_filesize(meta, filesize); scr_meta_set_complete(meta, file_valid); if (stat_rc == 0) { @@ -1879,6 +1897,7 @@ static int scr_complete_output(int valid) } /* apply redundancy scheme if we're still valid */ + if (rc == SCR_SUCCESS) { rc = scr_reddesc_apply(scr_map, scr_rd, scr_dataset_id); } diff --git a/src/scr_cache.c b/src/scr_cache.c index dbf55fee..91107819 100644 --- a/src/scr_cache.c +++ b/src/scr_cache.c @@ -282,6 +282,10 @@ int scr_cache_delete(scr_cache_index* cindex, int id) /* get the filename */ char* file = kvtree_elem_key(file_elem); + if (! 
scr_filemap_leader_rank(map, file) ) { + continue; + } + /* verify that file mtime and ctime have not changed since scr_complete_output, * which could idenitfy a bug in the user's code */ struct stat statbuf; @@ -653,6 +657,10 @@ int scr_cache_check_files(const scr_cache_index* cindex, int id) /* get the filename */ char* file = kvtree_elem_key(file_elem); + if ( ! scr_filemap_leader_rank(map, file) ) { + continue; + } + /* check that we can read the file */ if (scr_file_is_readable(file) != SCR_SUCCESS) { failed_read = 1; diff --git a/src/scr_filemap.c b/src/scr_filemap.c index b7d69517..01a4297c 100644 --- a/src/scr_filemap.c +++ b/src/scr_filemap.c @@ -311,3 +311,18 @@ int scr_filemap_write(const spath* file, const scr_filemap* map) return SCR_SUCCESS; } + +/* True if calling rank is designated leader for file */ +int scr_filemap_leader_rank(scr_filemap* map, const char *file) +{ + scr_meta* meta = scr_meta_new(); + int group_rank; + + scr_filemap_get_meta(map, file, meta); + scr_meta_get_group_rank(meta, &group_rank); + + scr_meta_delete(&meta); + + return group_rank == 0; +} + diff --git a/src/scr_filemap.h b/src/scr_filemap.h index e30c249d..4a240030 100644 --- a/src/scr_filemap.h +++ b/src/scr_filemap.h @@ -131,4 +131,7 @@ scr_filemap* scr_filemap_new(void); /* free memory resources assocaited with filemap */ int scr_filemap_delete(scr_filemap** ptr_map); +/* True if calling rank is designated leader for file */ +int scr_filemap_leader_rank(scr_filemap* map, const char *file); + #endif diff --git a/src/scr_flush.c b/src/scr_flush.c index 5714ccf1..ed31ab2a 100644 --- a/src/scr_flush.c +++ b/src/scr_flush.c @@ -270,6 +270,10 @@ int scr_flush_prepare(const scr_cache_index* cindex, int id, kvtree* file_list) /* get the filename */ char* file = kvtree_elem_key(elem); + if ( ! 
scr_filemap_leader_rank(map, file) ) { + continue; + } + /* read meta data for file and attach it to file list */ scr_meta* meta = scr_meta_new(); if (scr_filemap_get_meta(map, file, meta) == SCR_SUCCESS) { diff --git a/src/scr_keys.h b/src/scr_keys.h index 5d3432fb..74cc3679 100644 --- a/src/scr_keys.h +++ b/src/scr_keys.h @@ -177,6 +177,8 @@ Define common hash key strings #define SCR_META_KEY_CKPT ("CKPT") #define SCR_META_KEY_RANKS ("RANKS") #define SCR_META_KEY_RANK ("RANK") +#define SCR_META_KEY_GROUP_RANKS ("GROUP_RANKS") +#define SCR_META_KEY_GROUP_RANK ("GROUP_RANK") #define SCR_META_KEY_ORIG ("ORIG") #define SCR_META_KEY_PATH ("PATH") #define SCR_META_KEY_NAME ("NAME") diff --git a/src/scr_meta.c b/src/scr_meta.c index 3d796c5d..e9f4340b 100644 --- a/src/scr_meta.c +++ b/src/scr_meta.c @@ -157,6 +157,13 @@ int scr_meta_set_crc32(scr_meta* meta, uLong crc) return (rc == KVTREE_SUCCESS) ? SCR_SUCCESS : SCR_FAILURE; } +/* set the rank relative to the group for file */ +int scr_meta_set_group_rank(scr_meta* meta, int group_rank) +{ + int rc = kvtree_util_set_int(meta, SCR_META_KEY_GROUP_RANK, group_rank); + return (rc == KVTREE_SUCCESS) ? SCR_SUCCESS : SCR_FAILURE; +} + static void scr_stat_get_atimes(const struct stat* sb, uint64_t* secs, uint64_t* nsecs) { *secs = (uint64_t) sb->st_atime; @@ -306,6 +313,13 @@ int scr_meta_get_crc32(const scr_meta* meta, uLong* crc) return (rc == KVTREE_SUCCESS) ? SCR_SUCCESS : SCR_FAILURE; } +/* get the rank relative to the group for file */ +int scr_meta_get_group_rank(const scr_meta* meta, int* group_rank) +{ + int rc = kvtree_util_get_int(meta, SCR_META_KEY_GROUP_RANK, group_rank); + return (rc == KVTREE_SUCCESS) ? 
SCR_SUCCESS : SCR_FAILURE; +} + /* ========================================= Check field values diff --git a/src/scr_meta.h b/src/scr_meta.h index c97eb2c0..b5870513 100644 --- a/src/scr_meta.h +++ b/src/scr_meta.h @@ -97,6 +97,9 @@ int scr_meta_set_stat(scr_meta* meta, struct stat* statbuf); /* set the crc32 field on meta */ int scr_meta_set_crc32(scr_meta* meta, uLong crc); +/* set the rank relative to the group for file */ +int scr_meta_set_group_rank(scr_meta* meta, int group_rank); + /* ========================================= Get field values @@ -130,6 +133,9 @@ int scr_meta_get_complete(const scr_meta* meta, int* complete); /* get the crc32 field in meta data, returns SCR_SUCCESS if a field is set */ int scr_meta_get_crc32(const scr_meta* meta, uLong* crc); +/* get the rank relative to the group for file */ +int scr_meta_get_group_rank(const scr_meta* meta, int* group_rank); + /* ========================================= Check field values diff --git a/src/scr_reddesc.c b/src/scr_reddesc.c index 16d72229..3349cfb1 100644 --- a/src/scr_reddesc.c +++ b/src/scr_reddesc.c @@ -553,6 +553,11 @@ int scr_reddesc_apply( /* get the filename */ char* file = kvtree_elem_key(file_elem); + /* Skip over shared files that we are not leaders of */ + if ( ! scr_filemap_leader_rank(map, file) ) { + continue; + } + /* check the file */ if (! scr_bool_have_file(map, file)) { scr_dbg(2, "File determined to be invalid: %s", file); @@ -638,6 +643,11 @@ int scr_reddesc_apply( /* get the filename */ char* file = kvtree_elem_key(file_elem); + /* Skip over shared files that we are not leaders of */ + if ( ! 
scr_filemap_leader_rank(map, file) ) { + continue; + } + /* add file to the set */ if (ER_Add(set_id, file) != ER_SUCCESS) { scr_err("Failed to add file to ER set: %s @ %s:%d", file, __FILE__, __LINE__); diff --git a/src/scr_util.c b/src/scr_util.c index 0f6f18b8..9bdd2ed5 100644 --- a/src/scr_util.c +++ b/src/scr_util.c @@ -14,6 +14,7 @@ #include "scr.h" #include "scr_err.h" #include "scr_io.h" +#include "scr_filemap.h" #include "scr_util.h" #include diff --git a/src/scr_util.h b/src/scr_util.h index 51208fca..3c9096e5 100644 --- a/src/scr_util.h +++ b/src/scr_util.h @@ -41,6 +41,7 @@ #include "spath.h" #include "kvtree.h" +#include "scr_filemap.h" /* given a string, convert it to a double and write that value to val */ int scr_atod(const char* str, double* val);