diff --git a/src/mpi/romio/adio/Makefile.mk b/src/mpi/romio/adio/Makefile.mk index 10c3ec9f600..49160b36447 100644 --- a/src/mpi/romio/adio/Makefile.mk +++ b/src/mpi/romio/adio/Makefile.mk @@ -35,5 +35,6 @@ include $(top_srcdir)/adio/ad_pvfs2/Makefile.mk include $(top_srcdir)/adio/ad_testfs/Makefile.mk include $(top_srcdir)/adio/ad_ufs/Makefile.mk include $(top_srcdir)/adio/ad_xfs/Makefile.mk +include $(top_srcdir)/adio/ad_logfs/Makefile.mk include $(top_srcdir)/adio/common/Makefile.mk diff --git a/src/mpi/romio/adio/ad_logfs/ADIO_DESIGN b/src/mpi/romio/adio/ad_logfs/ADIO_DESIGN new file mode 100644 index 00000000000..797cf923217 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ADIO_DESIGN @@ -0,0 +1,45 @@ +* Independent vs collective open + + LogFS cannot reuse the existing file handle, since it is unable to set the view + in a reliable way from an independent call. (Which could be needed for example, if an + independent replay is needed) + + For now this would work, since ADIO_Set_view only modifies some ADIO_File variables; + However, for the new two phase, if I'm not mistaken, there was some communication on + set_view to precalculate some data. + + The real questions: + 1° Is ADIOI_Set_view collective or not? (in ALL ADIO drivers?) + 2° What about independent open? What about a replay on a CPU that didn't yet open the + file? + + +* Difficult (for the layering): + ADIO_Open doesn't have the MPI_Info even though MPI_File_open has it; + -> at open time the info is not available; only afterwards set_info + is called + +* Strange: + -> ADIOI_xxx_SetInfo is called BEFORE ADIOI_xxx_Open is called! + annoying, because ADIOI_xxx_Open can allocate the + fs_ptr structure (which might be used to store the hints) + however, it is only allocated on ADIOI_xxx_Open! 
+ + As a consequence, FS-specific data has to be in the general + ADIO_Fd structure (See union in ADIOI_Hints_struct ); + + Also, not calling ADIOI_GEN_SetInfo makes open crash + (because hints->cb_config_list remains 0 and ADIOI_cb_config_list_parse + assumes it is not 0 ) + +* Uses_generic_read | write + + -> should use properties / flags / ... instead of function addrs + + + +* data sieving: + For example, adio_open ("shared" function) + plays with access_mode (lies about it) just to be able to do data sieving + -> means that PVFS2 files in wr-only mode are also always opened in rdwr + mode, even though not needed diff --git a/src/mpi/romio/adio/ad_logfs/Makefile.mk b/src/mpi/romio/adio/ad_logfs/Makefile.mk new file mode 100644 index 00000000000..791321b78fd --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/Makefile.mk @@ -0,0 +1,45 @@ +## -*- Mode: Makefile; -*- +## vim: set ft=automake : +## +## (C) 2011 by Argonne National Laboratory. +## See COPYRIGHT in top-level directory. +## + +if BUILD_AD_LOGFS + +AM_CPPFLAGS += -I$(top_srcdir)/adio/ad_logfs + +noinst_HEADERS += adio/ad_logfs/ad_logfs_common.h\ + adio/ad_logfs/ad_logfs.h\ + adio/ad_logfs/logfs_file.h\ + adio/ad_logfs/logfs.h\ + adio/ad_logfs/logfs_info.h\ + adio/ad_logfs/logfs_rtree.h\ + adio/ad_logfs/logfs_user.h\ + adio/ad_logfs/rtree_config.h + +romio_other_sources += adio/ad_logfs/ad_logfs.c \ + adio/ad_logfs/ad_logfs_close.c \ + adio/ad_logfs/ad_logfs_common.c\ + adio/ad_logfs/ad_logfs_delete.c \ + adio/ad_logfs/ad_logfs_done.c\ + adio/ad_logfs/ad_logfs_fcntl.c\ + adio/ad_logfs/ad_logfs_flush.c\ + adio/ad_logfs/ad_logfs_getsh.c\ + adio/ad_logfs/ad_logfs_hints.c\ + adio/ad_logfs/ad_logfs_iread.c\ + adio/ad_logfs/ad_logfs_iwrite.c\ + adio/ad_logfs/ad_logfs_open.c\ + adio/ad_logfs/ad_logfs_rdcoll.c \ + adio/ad_logfs/ad_logfs_read.c\ + adio/ad_logfs/ad_logfs_resize.c\ + adio/ad_logfs/ad_logfs_seek.c\ + adio/ad_logfs/ad_logfs_setsh.c\ + adio/ad_logfs/ad_logfs_wait.c\ + adio/ad_logfs/ad_logfs_wrcoll.c\ + 
adio/ad_logfs/ad_logfs_write.c\ + adio/ad_logfs/logfs.c\ + adio/ad_logfs/logfs_file.c\ + adio/ad_logfs/logfs_rtree.c\ + adio/ad_logfs/ad_logfs_features.c +endif BUILD_AD_LOGFS diff --git a/src/mpi/romio/adio/ad_logfs/PROBLEMS b/src/mpi/romio/adio/ad_logfs/PROBLEMS new file mode 100644 index 00000000000..2e4618a9b67 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/PROBLEMS @@ -0,0 +1,9 @@ +* how to deal with delete? file is not open, so no view can be set, so we + cannot override the delete function to delete the logfiles too... + + +* multiple opens + -> share logfiles? + + + diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs.c b/src/mpi/romio/adio/ad_logfs/ad_logfs.c new file mode 100644 index 00000000000..5d061acae21 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs.c @@ -0,0 +1,42 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_logfs.c,v 1.2 2002/10/24 17:01:03 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include "ad_logfs.h" + +/* adioi.h has the ADIOI_Fns_struct define */ +#include "adioi.h" + +struct ADIOI_Fns_struct ADIO_LOGFS_operations = { + ADIOI_LOGFS_Open, /* Open */ + ADIOI_GEN_OpenColl, /* OpenColl */ + ADIOI_LOGFS_ReadContig, /* ReadContig */ + ADIOI_LOGFS_WriteContig, /* WriteContig */ + ADIOI_LOGFS_ReadStridedColl, /* ReadStridedColl */ + ADIOI_LOGFS_WriteStridedColl, /* WriteStridedColl */ + ADIOI_LOGFS_SeekIndividual, /* SeekIndividual */ + ADIOI_LOGFS_Fcntl, /* Fcntl */ + ADIOI_LOGFS_SetInfo, /* SetInfo */ + ADIOI_LOGFS_ReadStrided, /* ReadStrided */ + ADIOI_LOGFS_WriteStrided, /* WriteStrided */ + ADIOI_LOGFS_Close, /* Close */ + ADIOI_LOGFS_IreadContig, /* IreadContig */ + ADIOI_LOGFS_IwriteContig, /* IwriteContig */ + ADIOI_GEN_IODone, /* ReadDone */ + ADIOI_GEN_IODone, /* WriteDone */ + ADIOI_GEN_IOComplete, /* ReadComplete */ + ADIOI_GEN_IOComplete, /* WriteComplete */ + ADIOI_LOGFS_IreadStrided, /* IreadStrided */ + ADIOI_LOGFS_IwriteStrided, /* IwriteStrided */ + ADIOI_LOGFS_Flush, /* Flush */ + ADIOI_LOGFS_Resize, /* Resize */ + ADIOI_LOGFS_Delete, /* Delete */ + ADIOI_LOGFS_Feature, /* Features */ + "LOGFS: logging layer for ROMIO drivers", + ADIOI_GEN_IreadStridedColl, /* IreadStridedColl */ + ADIOI_GEN_IwriteStridedColl /* IwriteStridedColl */ +}; diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs.h b/src/mpi/romio/adio/ad_logfs/ad_logfs.h new file mode 100644 index 00000000000..186a71c0258 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs.h @@ -0,0 +1,82 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs.h,v 1.2 2002/10/24 17:01:03 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#ifndef AD_LOGFS_INCLUDE +#define AD_LOGFS_INCLUDE + +#include +#include +#include +#include +#include "adio.h" + +void ADIOI_LOGFS_Open(ADIO_File fd, int *error_code); +void ADIOI_LOGFS_Close(ADIO_File fd, int *error_code); +void ADIOI_LOGFS_ReadContig(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int + *error_code); +void ADIOI_LOGFS_WriteContig(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int + *error_code); +void ADIOI_LOGFS_IwriteContig(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Request * request, int + *error_code); +void ADIOI_LOGFS_IreadContig(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Request * request, int + *error_code); +int ADIOI_LOGFS_ReadDone(ADIO_Request * request, ADIO_Status * status, int + *error_code); +int ADIOI_LOGFS_WriteDone(ADIO_Request * request, ADIO_Status * status, int + *error_code); +void ADIOI_LOGFS_ReadComplete(ADIO_Request * request, ADIO_Status * status, int + *error_code); +void ADIOI_LOGFS_WriteComplete(ADIO_Request * request, ADIO_Status * status, int *error_code); +void ADIOI_LOGFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code); +void ADIOI_LOGFS_WriteStrided(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int *error_code); +void ADIOI_LOGFS_ReadStrided(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int + *error_code); +void ADIOI_LOGFS_WriteStridedColl(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int + *error_code); +void ADIOI_LOGFS_ReadStridedColl(ADIO_File 
fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int + *error_code); +void ADIOI_LOGFS_IreadStrided(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Request * request, int + *error_code); +void ADIOI_LOGFS_IwriteStrided(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Request * request, int + *error_code); +void ADIOI_LOGFS_Flush(ADIO_File fd, int *error_code); +void ADIOI_LOGFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); +ADIO_Offset ADIOI_LOGFS_SeekIndividual(ADIO_File fd, ADIO_Offset offset, + int whence, int *error_code); +void ADIOI_LOGFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); +void ADIOI_LOGFS_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset * shared_fp, int *error_code); +void ADIOI_LOGFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code); +void ADIOI_LOGFS_Delete(const char *filename, int *error_code); + + +void ADIOI_LOGFS_Set_slave(ADIO_File fd, ADIOI_Fns * slaveops); + +int ADIOI_LOGFS_Feature(ADIO_File fd, int flag); + + +#endif diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_close.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_close.c new file mode 100644 index 00000000000..16aed492609 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_close.c @@ -0,0 +1,32 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_close.c,v 1.2 2002/10/24 17:01:03 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */
+
+#include "ad_logfs.h"
+#include "adioi.h"
+#include "layered.h"
+#include "logfs.h"
+
+
+
+void ADIOI_LOGFS_Close(ADIO_File fd, int *error_code)
+{
+    int standalone;
+
+    standalone = logfs_standalone(fd);
+
+    /* deactivate logfs ;
+     * If replay_on_close is set, there will first be a replay */
+    logfs_deactivate(fd);
+
+    *error_code = MPI_SUCCESS;
+
+    /* layering (if any) is disabled in logfs_deactivate
+     * so we just need to call the close function of the slave */
+    if (!standalone)
+        fd->fns->ADIOI_xxx_Close(fd, error_code);
+}
diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_common.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_common.c
new file mode 100644
index 00000000000..bcd4e74fb6e
--- /dev/null
+++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_common.c
@@ -0,0 +1,119 @@
+#include
+#include
+#include "adio.h"
+#include "adioi.h"
+#include "ad_logfs_common.h"
+
+/* Return 1 when 'buf' spells a true value -- "1", or "true" in any letter
+ * case -- and 0 otherwise (including for a NULL or empty string).
+ * 'buf' is only read, so string literals and values the caller still needs
+ * are safe to pass. */
+int ad_logfs_checkbool(const char *buf)
+{
+    static const char truth[] = "true";
+    int i;
+
+    if (!buf || !buf[0])
+        return 0;
+
+    if (!strcmp(buf, "1"))
+        return 1;
+
+    /* ASCII case-insensitive match against "true", done by hand so the
+     * const input is never written to */
+    for (i = 0; truth[i] != '\0'; i++) {
+        char c = buf[i];
+
+        if (c >= 'A' && c <= 'Z')
+            c = (char) (c - 'A' + 'a');
+        if (c != truth[i])
+            return 0;
+    }
+
+    return buf[i] == '\0';
+}
+
+
+/* Look up 'key' in 'info'.  When the key is present, store its boolean
+ * reading (see ad_logfs_checkbool) in *val and return 1; return 0 and leave
+ * *val untouched when 'info' is MPI_INFO_NULL or the key is absent. */
+int ad_logfs_hint_bool(MPI_Info info, const char *key, int *val)
+{
+    int flag;
+    char buf[255];
+
+    if (info == MPI_INFO_NULL)
+        return 0;
+
+    MPI_Info_get(info, (char *) key, sizeof(buf) - 1, &buf[0], &flag);
+    if (!flag)
+        return 0;
+
+    *val = ad_logfs_checkbool(buf);
+    return 1;
+}
+
+/* Same lookup as ad_logfs_hint_bool, but the value is converted with atoi():
+ * malformed numbers are not reported, they simply yield atoi's result. */
+int ad_logfs_hint_int(MPI_Info info, const char *key, int *val)
+{
+    int flag;
+    char buf[255];
+
+    if (info == MPI_INFO_NULL)
+        return 0;
+
+    MPI_Info_get(info, (char *) key, sizeof(buf) - 1, &buf[0], &flag);
+    if (!flag)
+        return 0;
+
+    *val = atoi(buf);
+    return 1;
+}
+
+/* Same lookup as ad_logfs_hint_bool for string values.  When the key is
+ * present, any previous *str is released with ADIOI_Free and replaced by an
+ * ADIOI_Strdup copy of the value, which the caller then owns. */
+int ad_logfs_hint_str(MPI_Info info, const char *key, char **str)
+{
+    int flag;
+    char buf[255];
+
+    if (info == MPI_INFO_NULL)
+        return 0;
+
+    MPI_Info_get(info, (char *) key, sizeof(buf) - 1, &buf[0], &flag);
+    if (!flag)
+        return 0;
+
+    if (*str)
+        ADIOI_Free(*str);
+
+    *str = ADIOI_Strdup(buf);
+
+    return 1;
+}
+
+/* Store 'val' under 'key' as the string "true" or "false". */
+void ad_logfs_hint_set_bool(MPI_Info info, const char *key, int val)
+{
+    assert(info != MPI_INFO_NULL);
+    MPI_Info_set(info, (char *) key, (val ? "true" : "false"));
+}
+
+/* Store 'val' under 'key' as a decimal string. */
+void ad_logfs_hint_set_int(MPI_Info info, const char *key, int val)
+{
+    assert(info != MPI_INFO_NULL);
+    char buf[255];
+    snprintf(buf, sizeof(buf) - 1, "%i", val);
+    MPI_Info_set(info, (char *) key, buf);
+}
+
+/* if 'str' is NULL, info won't be set and key will not exist in info object */
+void ad_logfs_hint_set_str(MPI_Info info, const char *key, const char *str)
+{
+    assert(info != MPI_INFO_NULL);
+    if (str != NULL)
+        MPI_Info_set(info, (char *) key, (char *) str);
+}
diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_common.h b/src/mpi/romio/adio/ad_logfs/ad_logfs_common.h
new file mode 100644
index 00000000000..2bbc7b897a1
--- /dev/null
+++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_common.h
@@ -0,0 +1,22 @@
+#ifndef AD_LOGFS_COMMON_H
+#define AD_LOGFS_COMMON_H
+
+/* Check str for TRUE, true, 1, ... ; Return the bool value of the string
+ * contents */
+int ad_logfs_checkbool(const char *str);
+
+
+/* return true if the hint was set; if so, adjust val */
+int ad_logfs_hint_bool(MPI_Info info, const char *key, int *val);
+
+int ad_logfs_hint_int(MPI_Info info, const char *key, int *val);
+
+int ad_logfs_hint_str(MPI_Info info, const char *key, char **str);
+
+
+/* Setting hints */
+void ad_logfs_hint_set_bool(MPI_Info info, const char *key, int val);
+void ad_logfs_hint_set_int(MPI_Info info, const char *key, int val);
+void ad_logfs_hint_set_str(MPI_Info info, const char *key, const char *str);
+
+#endif
diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_delete.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_delete.c
new file mode 100644
index 00000000000..1baf1a9182c
--- /dev/null
+++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_delete.c
@@ -0,0 +1,29 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * $Id: ad_testfs_delete.c,v 1.2 2002/10/24 17:01:03 gropp Exp $
+ *
+ * Copyright (C) 2001 University of
Chicago.
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#include
+
+#include "ad_logfs.h"
+#include "adioi.h"
+#include "logfs.h"
+
+/* delete can only be called in standalone mode
+ * (unless we also modify the global delete to detect logfs files) */
+void ADIOI_LOGFS_Delete(const char *filename, int *error_code)
+{
+    *error_code = MPI_SUCCESS;
+
+    /* logfs_delete removes all the logfs ancillary files (.logfs, .meta,
+     * .data) related to 'filename', but not the actual file */
+    logfs_delete (filename);
+
+    /* MPI_File_delete already stripped the prefix. TODO: How do we avoid a
+     * loop if we are using logfs as a data representation ? */
+    *error_code = MPI_File_delete(filename, MPI_INFO_NULL);
+
+}
diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_done.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_done.c
new file mode 100644
index 00000000000..16d1a4f3f6b
--- /dev/null
+++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_done.c
@@ -0,0 +1,27 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * $Id: ad_testfs_done.c,v 1.3 2002/10/24 17:01:03 gropp Exp $
+ *
+ * Copyright (C) 2001 University of Chicago.
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#include "ad_logfs.h"
+#include "adioi.h"
+#include "layered.h"
+
+/* Always reports the read request as complete (returns 1); 'status' and
+ * '*error_code' are left untouched. */
+int ADIOI_LOGFS_ReadDone(ADIO_Request * request, ADIO_Status * status, int
+                         *error_code)
+{
+    return 1;
+}
+
+/* Always reports the write request as complete (returns 1); 'status' and
+ * '*error_code' are left untouched. */
+int ADIOI_LOGFS_WriteDone(ADIO_Request * request, ADIO_Status * status, int
+                          *error_code)
+{
+    return 1;
+}
diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_fcntl.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_fcntl.c
new file mode 100644
index 00000000000..4394c4ddd18
--- /dev/null
+++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_fcntl.c
@@ -0,0 +1,57 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * $Id: ad_testfs_fcntl.c,v 1.8 2004/11/01 21:36:58 robl Exp $
+ *
+ * Copyright (C) 2001 University of Chicago.
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#include
+#include "logfs.h"
+#include "ad_logfs.h"
+#include "adioi.h"
+#include "adio_extern.h"
+#include "layered.h"
+
+/* ADIOI_LOGFS_Fcntl
+ *
+ * Answers file-control requests from the state logfs keeps for 'fd':
+ *   ADIO_FCNTL_GET_FSIZE      stores logfs_getfsize(fd) in fcntl_struct->fsize
+ *   ADIO_FCNTL_SET_DISKSPACE  grows the file to fcntl_struct->diskspace with
+ *                             logfs_resize(); a smaller request is a no-op
+ * Any other flag, ADIO_FCNTL_SET_ATOMICITY included, is rejected with an
+ * MPI_ERR_ARG error code in *error_code.
+ */
+void ADIOI_LOGFS_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, int *error_code)
+{
+    static char myname[] = "ADIOI_LOGFS_FCNTL";
+
+    *error_code = MPI_SUCCESS;
+
+    switch (flag) {
+        case ADIO_FCNTL_GET_FSIZE:
+            /* return filesize from memory */
+            fcntl_struct->fsize = logfs_getfsize(fd);
+            break;
+        case ADIO_FCNTL_SET_DISKSPACE:
+            /* change into set_size */
+            /* problem here: if size < current size, nothing happens
+             * with preallocate (so it is different from set_size)
+             *
+             * Will be problem in wr-only mode and withy tracking filesize,
+             * since getfsize will be illegal in this case
+             *
+             * The requested size arrives in 'diskspace' (that is the field
+             * MPI_File_preallocate fills in); 'fsize' is only the output
+             * slot of ADIO_FCNTL_GET_FSIZE */
+            if (logfs_getfsize(fd) < fcntl_struct->diskspace)
+                logfs_resize(fd, fcntl_struct->diskspace);
+            *error_code = MPI_SUCCESS;
+            break;
+        case ADIO_FCNTL_SET_ATOMICITY:
+        default:
+            *error_code = MPIO_Err_create_code(MPI_SUCCESS,
+                                               MPIR_ERR_RECOVERABLE,
+                                               myname, __LINE__,
+                                               MPI_ERR_ARG, "**flag", "**flag %d", flag);
+    }
+}
diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_features.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_features.c
new file mode 100644
index 00000000000..e7edc1e97a5
--- /dev/null
+++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_features.c
@@ -0,0 +1,28 @@
+/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
+/*
+ *
+ * (C) 2008 by Argonne National Laboratory.
+ * See COPYRIGHT in top-level directory.
+ */ + +#include "adio.h" + +int ADIOI_LOGFS_Feature(ADIO_File fd, int flag) +{ + switch(flag) { + case ADIO_LOCKS: + case ADIO_DATA_SIEVING_WRITES: + case ADIO_UNLINK_AFTER_CLOSE: + case ADIO_TWO_PHASE: + case ADIO_SCALABLE_RESIZE: + return 1; + break; + case ADIO_SCALABLE_OPEN: + case ADIO_ATOMIC_MODE: + case ADIO_SHARED_FP: + default: + return 0; + break; + } + +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_flush.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_flush.c new file mode 100644 index 00000000000..4e942539efe --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_flush.c @@ -0,0 +1,34 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_flush.c,v 1.2 2002/10/24 17:01:04 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" +#include "logfs.h" + +void ADIOI_LOGFS_Flush(ADIO_File fd, int *error_code) +{ + void *handle; + + *error_code = MPI_SUCCESS; + + /* When reading is possible, forced to do log replay here to + * adhere to MPI file consistency rules + * + * If not, just force everything to the logfile + */ + /* flush */ + logfs_flush(fd); + + if (!logfs_standalone(fd)) { + /* also flush the real file */ + handle = ADIOI_Layer_switch_in(fd); + fd->fns->ADIOI_xxx_Flush(fd, error_code); + ADIOI_Layer_switch_out(fd, handle); + } +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_getsh.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_getsh.c new file mode 100644 index 00000000000..a071b092218 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_getsh.c @@ -0,0 +1,31 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_getsh.c,v 1.2 2002/10/24 17:01:04 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" + +void ADIOI_LOGFS_Get_shared_fp(ADIO_File fd, int size, ADIO_Offset * shared_fp, int *error_code) +{ + void *handle; + + *error_code = MPI_SUCCESS; + + /* not supported */ + assert(0); + + /* do we need to log set shared/ get shared? + * probably not */ + + /* will this work correctly when seeking past the end of the file? + * if not, we need to make setsize calls before doing this */ + + handle = ADIOI_Layer_switch_in(fd); + ADIO_Get_shared_fp(fd, size, shared_fp, error_code); + ADIOI_Layer_switch_out(fd, handle); +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_hints.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_hints.c new file mode 100644 index 00000000000..ead1ace5914 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_hints.c @@ -0,0 +1,78 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_logfs.hints.c,v 1.4 2002/10/24 17:01:04 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" +#include "logfs.h" + +void ADIOI_LOGFS_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) +{ + *error_code = MPI_SUCCESS; + + + /* In standalone mode we have complete control over hints, so just + * process the users' info structure and return; + * However, somehow we still need to call the gen_setinfo + * since otherwise there are segmentation faults in other parts + * of the code that depend on certain things to be set + * (two-phase for example) */ + if (logfs_standalone(fd)) { + + /* this modifies fd->info */ + ADIOI_GEN_SetInfo(fd, users_info, error_code); + + if (fd->fs_ptr) { + /* We have a fs_ptr, so update our internal values and + * update fd->info (which is returned by MPI_File_get_info) */ + logfs_setinfo(fd, users_info); + return; + } + else { + /* no fs-ptr yet; This must mean that we're in the process + * of opening a new file in standalone mode; + * ADIO_SetInfo is called *BEFORE* ADIO_Open is called; + * As such, we don't have a fs_ptr structure, and we don't have + * anywhere to store our filesystem specific hints (except + * if we modify the union in ADIO_File and pollute it with + * fs-specific members); + * Instead we transfer our hints into fd->info, + * and call ADIO_SetInfo once more from within ADIO_Open*/ + logfs_transfer_hints(users_info, fd->info); + } + *error_code = MPI_SUCCESS; + return; + } + + /* special case here: + * (not applicable for standalone mode) + * + * setinfo is called from within ADIO_Open (which returns the fd struct) + * so we cannot call ADIOI_Layer_init until after the call to ADIO_Open + * returns. 
+ * + * The only function that is called from within open is the Setinfo + * call, so here we need to take into consideration that ADIOI_Layer_init + * is not yet called; A sign of this is that fs_ptr is 0 + */ + + /* if we have a slave, pass the setinfo call, if not use the generic + * function to update the info argument which will be passed to the slave + * once it is set */ + if (fd->fs_ptr && ADIOI_Layer_is_slave_set(fd)) { + void *handle = ADIOI_Layer_switch_in(fd); + fd->fns->ADIOI_xxx_SetInfo(fd, users_info, error_code); + ADIOI_Layer_switch_out(fd, handle); + } + else { + /* this basically stores the hint in fd->info; we reuse this hint + * the moment we can call the slave's open; we call set_info on the slave + * (which takes a copy of the hint) and free the original hint*/ + ADIOI_GEN_SetInfo(fd, users_info, error_code); + } +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_iread.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_iread.c new file mode 100644 index 00000000000..39ff7393e95 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_iread.c @@ -0,0 +1,47 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_iread.c,v 1.3 2002/10/24 17:01:04 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" + +/* ADIOI_LOGFS_IreadContig() + * + * Implemented by immediately calling ReadContig() + */ +void ADIOI_LOGFS_IreadContig(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Request * request, int + *error_code) +{ + MPI_Count typesize; + + MPI_Status status; + + MPI_Type_size_x(datatype, &typesize); + *error_code = MPI_SUCCESS; + + ADIOI_LOGFS_ReadContig(fd, buf, count, datatype, file_ptr_type, + offset, &status, error_code); + MPIO_Completed_request_create(&fd, count*typesize, error_code, request); + +} + +void ADIOI_LOGFS_IreadStrided(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Request * request, int + *error_code) +{ + MPI_Status status; + MPI_Count typesize; + MPI_Type_size_x(datatype, &typesize); + + ADIOI_LOGFS_ReadStrided(fd, buf, count, datatype, file_ptr_type, + offset, &status, error_code); + MPIO_Completed_request_create(&fd, count*typesize, error_code, request); +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_iwrite.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_iwrite.c new file mode 100644 index 00000000000..3e0b548fc0e --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_iwrite.c @@ -0,0 +1,66 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_iwrite.c,v 1.3 2002/10/24 17:01:04 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" + +/** + * + * Could improve this: + * add functions to writebuf to free up memory; in + * the start function, inform writebuf that it should try to free at least + * this many bytes; in the done function, write dump it to the writebuf; + * either free or not + * + * Hmm; this is not needed; calling the progress function on the writebuf + * already tries to free mem; it would be enough to call the progress + * function in the start func + */ + + + +/* ADIOI_LOGFS_IwriteContig() + * + * Implemented by immediately calling WriteContig() + */ +void ADIOI_LOGFS_IwriteContig(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Request * request, int + *error_code) +{ + ADIO_Status status; + MPI_Offset len; + MPI_Count typesize; + int myrank, nprocs; + + MPI_Type_size_x(datatype, &typesize); + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + len = count * typesize; + ADIOI_LOGFS_WriteContig(fd, buf, count, datatype, file_ptr_type, offset, &status, error_code); + + MPIO_Completed_request_create(&fd, len, error_code, request); +} + +void ADIOI_LOGFS_IwriteStrided(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Request * request, int + *error_code) +{ + ADIO_Status status; + MPI_Count typesize; + + MPI_Type_size_x(datatype, &typesize); + + ADIOI_LOGFS_WriteStrided(fd, buf, count, datatype, file_ptr_type, + offset, &status, error_code); + + MPIO_Completed_request_create(&fd, typesize * count, error_code, request); +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_open.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_open.c new file mode 100644 index 00000000000..372df2529f8 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_open.c @@ -0,0 +1,32 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_open.c,v 1.2 2002/10/24 17:01:04 gropp Exp $ 
+ * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" +#include "logfs.h" + +/* Open is only called when we are in complete control + * (e.g. not through set_view ("logfs") */ + +void ADIOI_LOGFS_Open(ADIO_File fd, int *error_code) +{ + int ret; + *error_code = MPI_SUCCESS; + + ret = logfs_activate (fd, fd->info); + if (ret == MPI_SUCCESS) { + /* need to set view so that an entry is made in the logfile describing + * our default view*/ + logfs_set_view(fd, 0, MPI_BYTE, MPI_BYTE); + } else { + *error_code = ADIOI_Err_create_code("ADIOI_LOGFS_Open", + fd->filename, ret); + } +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_rdcoll.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_rdcoll.c new file mode 100644 index 00000000000..8fc5bf454f7 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_rdcoll.c @@ -0,0 +1,44 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_rdcoll.c,v 1.4 2002/10/24 17:01:05 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "logfs.h" +#include "layered.h" + +void ADIOI_LOGFS_ReadStridedColl(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int *error_code) +{ + int ret; + + *error_code = MPI_SUCCESS; + + if (ADIO_INDIVIDUAL == file_ptr_type) { + /* fp->ind is in bytes ignoring view */ + offset = fd->fp_ind - fd->disp; + offset /= fd->etype_size; + } + + /* now offset in etypes rel to displacement of view */ + /* collective read */ + ret = logfs_readdata(fd, buf, count, datatype, offset, 1, status); + + if (file_ptr_type == ADIO_INDIVIDUAL) { + int size; + int datasize; + MPI_Aint extent; + MPI_Type_extent(fd->filetype, &extent); + MPI_Type_size(fd->filetype, &size); + MPI_Type_size(datatype, &datasize); + assert(!((datasize * count) % size)); + fd->fp_ind = offset + extent * ((datasize * count) / size); + } + + *error_code = ret; +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_read.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_read.c new file mode 100644 index 00000000000..07b3bbe1d15 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_read.c @@ -0,0 +1,79 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_read.c,v 1.6 2002/10/24 17:01:05 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" +#include "logfs.h" + +/* here offset has been converted from etypes into bytes */ +void ADIOI_LOGFS_ReadContig(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int + *error_code) +{ + int myrank, nprocs, datatype_size; + int ret; + + *error_code = MPI_SUCCESS; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + MPI_Type_size(datatype, &datatype_size); + + if (file_ptr_type == ADIO_INDIVIDUAL) + offset = fd->fp_ind; + + /* read independent */ + ret = logfs_readdata(fd, buf, count, datatype, offset, 0, status); + + /* update indiv. filepointer (== in bytes) */ + if (file_ptr_type != ADIO_EXPLICIT_OFFSET) { + offset = fd->fp_ind; + fd->fp_ind += datatype_size * count; + fd->fp_sys_posn = fd->fp_ind; + } + + *error_code = ret; +} + +void ADIOI_LOGFS_ReadStrided(ADIO_File fd, void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int + *error_code) +{ + int myrank, nprocs; + int ret; + + *error_code = MPI_SUCCESS; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + if (ADIO_INDIVIDUAL == file_ptr_type) { + offset = fd->fp_ind - fd->disp; + offset /= fd->etype_size; + } + + /* now offset in etypes rel to displacement of view */ + /* Read data non collective */ + ret = logfs_readdata(fd, buf, count, datatype, offset, 0, status); + + if (file_ptr_type == ADIO_INDIVIDUAL) { + int size; + int datasize; + MPI_Aint extent; + MPI_Type_extent(fd->filetype, &extent); + MPI_Type_size(fd->filetype, &size); + MPI_Type_size(datatype, &datasize); + assert(!((datasize * count) % size)); + fd->fp_ind = offset + extent * ((datasize * count) / size); + } + + *error_code = ret; +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_resize.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_resize.c new file mode 100644 index 00000000000..52b9cc461da --- 
/dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_resize.c @@ -0,0 +1,20 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_resize.c,v 1.2 2002/10/24 17:01:05 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" +#include "logfs.h" + +void ADIOI_LOGFS_Resize(ADIO_File fd, ADIO_Offset size, int *error_code) +{ + /* resize always works */ + *error_code = MPI_SUCCESS; + + logfs_resize(fd, size); +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_seek.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_seek.c new file mode 100644 index 00000000000..6a58b0000cf --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_seek.c @@ -0,0 +1,47 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_seek.c,v 1.6 2004/10/25 18:46:00 robl Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "adio_extern.h" +#include "layered.h" + +/* ADIOI_LOGFS_SeekIndividual() + * + * Implements SEEK_SET only (and doesn't test for whence type); all + * other types of whence must be converted before calling this. + * + * Returns an absolute offset in bytes. The offset passed into the call is in + * terms of the etype relative to the filetype, so some calculations are + * necessary. + */ +ADIO_Offset ADIOI_LOGFS_SeekIndividual(ADIO_File fd, ADIO_Offset offset, + int whence, int *error_code) +{ + int myrank, nprocs; + + *error_code = MPI_SUCCESS; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + /* FPRINTF(stdout, "[%d/%d] ADIOI_LOGFS_SeekIndividual called on %s\n", + * myrank, nprocs, fd->filename); + */ + /* don't need to pass seek operations to the slave; + * The file positions are reset on a view switch, so we just need + * to keep track of the seek position in memory. 
+ * + * ADIOI_Gen_SeekIndividual does this for us + */ + + + /* have ADIO_GEN_SeekIndividual update the individual filepointer + * (fd->fp_ind) */ + return ADIOI_GEN_SeekIndividual(fd, offset, whence, error_code); + +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_setsh.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_setsh.c new file mode 100644 index 00000000000..84460ca9440 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_setsh.c @@ -0,0 +1,33 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_setsh.c,v 1.2 2002/10/24 17:01:05 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" + +void ADIOI_LOGFS_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code) +{ + int myrank, nprocs; + void *handle; + + *error_code = MPI_SUCCESS; + + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + FPRINTF(stdout, "[%d/%d] ADIOI_LOGFS_Set_shared_fp called on %s\n", + myrank, nprocs, fd->filename); + + /* what to do here ??? */ + assert(0); + + handle = ADIOI_Layer_switch_in(fd); + ADIO_Set_shared_fp(fd, offset, error_code); + ADIOI_Layer_switch_out(fd, handle); + +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_wait.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_wait.c new file mode 100644 index 00000000000..b0227d095db --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_wait.c @@ -0,0 +1,23 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_wait.c,v 1.4 2002/10/24 17:01:05 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. 
+ */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" + +void ADIOI_LOGFS_ReadComplete(ADIO_Request * request, ADIO_Status * status, int + *error_code) +{ + return; +} + +void ADIOI_LOGFS_WriteComplete(ADIO_Request * request, ADIO_Status * status, int + *error_code) +{ + return; +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_wrcoll.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_wrcoll.c new file mode 100644 index 00000000000..c21a00978d0 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_wrcoll.c @@ -0,0 +1,62 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_wrcoll.c,v 1.4 2002/10/24 17:01:06 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" +#include "logfs.h" +#include + +void ADIOI_LOGFS_WriteStridedColl(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int *error_code) +{ + int myrank, nprocs; + + *error_code = MPI_SUCCESS; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); +/* FPRINTF(stdout, "[%d/%d] ADIOI_LOGFS_WriteStridedColl called on %s\n", + myrank, nprocs, fd->filename); + FPRINTF(stdout, "Filetype: fd->file_system: %i", fd->file_system); */ + + if (ADIO_INDIVIDUAL == file_ptr_type) { + /* fp->ind is in bytes ignoring view */ + offset = fd->fp_ind - fd->disp; + offset /= fd->etype_size; + } + + /* now offset in etypes rel to displacement of view */ + logfs_writedata(fd, buf, count, datatype, offset, 1); + + if (file_ptr_type == ADIO_INDIVIDUAL) { + int size; + int datasize; + MPI_Aint extent; + MPI_Type_extent(fd->filetype, &extent); + MPI_Type_size(fd->filetype, &size); + MPI_Type_size(datatype, &datasize); + assert(!((datasize * count) % size)); + fd->fp_ind = offset + extent * ((datasize * count) / size); + } + +/* + handle = ADIOI_Layer_switch_in (fd); + 
fd->fns->ADIOI_xxx_WriteStridedColl (fd, buf,count,datatype,file_ptr_type, + offset, status, error_code); + ADIOI_Layer_switch_out(fd,handle); +*/ + if (status) { + int bufsize, size; + /* Don't set status if it isn't needed */ + MPI_Type_size(datatype, &size); + bufsize = size * count; + MPIR_Status_set_bytes(status, datatype, bufsize); + } +} diff --git a/src/mpi/romio/adio/ad_logfs/ad_logfs_write.c b/src/mpi/romio/adio/ad_logfs/ad_logfs_write.c new file mode 100644 index 00000000000..a623811d9d2 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/ad_logfs_write.c @@ -0,0 +1,102 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * $Id: ad_testfs_write.c,v 1.5 2002/10/24 17:01:06 gropp Exp $ + * + * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. + */ + +#include "ad_logfs.h" +#include "adioi.h" +#include "layered.h" +#include "logfs_file.h" +#include "logfs.h" +#include + +void ADIOI_LOGFS_WriteContig(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int + *error_code) +{ + int myrank, nprocs, datatype_size; + /*int etype_extent; */ + + *error_code = MPI_SUCCESS; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + MPI_Type_size(datatype, &datatype_size); + /* FPRINTF(stdout, "[%d/%d] ADIOI_LOGFS_WriteContig called on %s\n", myrank, + * nprocs, fd->filename); + * FPRINTF(stdout, "Filetype: fd->file_system: %i", fd->file_system); + */ + + if (file_ptr_type == ADIO_INDIVIDUAL) + offset = fd->fp_ind; + + /* convert offset into view (including disp+etype) */ + offset -= fd->disp; + offset /= fd->etype_size; + + logfs_writedata(fd, buf, count, datatype, offset, 0); + + if (file_ptr_type == ADIO_INDIVIDUAL) + fd->fp_ind += datatype_size * count; + + /* + * handle = ADIOI_Layer_switch_in (fd); + * fd->fns->ADIOI_xxx_WriteContig (fd, buf,count,datatype,file_ptr_type, + * offset,status,error_code); + * 
ADIOI_Layer_switch_out(fd,handle); + */ + + if (status) { + int bufsize, size; + /* Don't set status if it isn't needed */ + MPI_Type_size(datatype, &size); + bufsize = size * count; + MPIR_Status_set_bytes(status, datatype, bufsize); + } +} + +void ADIOI_LOGFS_WriteStrided(ADIO_File fd, const void *buf, int count, + MPI_Datatype datatype, int file_ptr_type, + ADIO_Offset offset, ADIO_Status * status, int *error_code) +{ + int myrank, nprocs; + + *error_code = MPI_SUCCESS; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + /*FPRINTF(stdout, "[%d/%d] ADIOI_LOGFS_WriteStrided called on %s\n", + * myrank, nprocs, fd->filename); + * FPRINTF(stdout, "Filetype: fd->file_system: %i", fd->file_system); + */ + if (ADIO_INDIVIDUAL == file_ptr_type) { + offset = fd->fp_ind - fd->disp; + offset /= fd->etype_size; + } + + /* now offset in etypes rel to displacement of view */ + logfs_writedata(fd, buf, count, datatype, offset, 0); + + if (file_ptr_type == ADIO_INDIVIDUAL) { + int size; + int datasize; + MPI_Aint extent; + MPI_Type_extent(fd->filetype, &extent); + MPI_Type_size(fd->filetype, &size); + MPI_Type_size(datatype, &datasize); + assert(!((datasize * count) % size)); + fd->fp_ind = offset + extent * ((datasize * count) / size); + } + + if (status) { + int bufsize, size; + /* Don't set status if it isn't needed */ + MPI_Type_size(datatype, &size); + bufsize = size * count; + MPIR_Status_set_bytes(status, datatype, bufsize); + } +} diff --git a/src/mpi/romio/adio/ad_logfs/logfs-reader.c b/src/mpi/romio/adio/ad_logfs/logfs-reader.c new file mode 100644 index 00000000000..b032b5dc3f6 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/logfs-reader.c @@ -0,0 +1,210 @@ +/* here's a simple utility to read the logfs .logfs and .meta files. These + * structures are all internal to logfs and defined in ad_logfs/logfs.c, so if + * they are modified there, one will have to manually update the structures + * here. 
+ * + * if while reading the file you get an assertion (see the 'UNKNOWN RECORD TYPE + * IN LOG ' comment below), that might be because the log file was generatd + * with (or without) magic guards */ + +#include +#include +#include + +#include + +#include + +#include + +#include +typedef struct +{ + char magic[64]; /* file magic */ + int flags; + int logfilecount; /* max. number of lock files possibly created for + this file (acros reopens) (== number of CPUs + used in open/create) */ + int epoch; /* next epoch number (used in reopen) */ + char logfilebase[255]; /* base filename for logfiles */ +} logfs_logfsfile_header; + +/* from logfs_file.c */ +/* ======================================================================= */ +/* ============= LOGFS_FILE RECORDTYPE =================================== */ +/* ======================================================================= */ + +/* + * The displacement and datatype follow the record header + */ +#define LOGFS_FILE_RECORD_VIEW 1 + +/* + * After this header kind: + * - seek position (in etypes) + * - datasize (in bytes) + * + * The datasize (in bytes) followed by the + * actual data + */ +#define LOGFS_FILE_RECORD_DATA 2 + +/* + * Followed by the epoch number + */ +#define LOGFS_FILE_RECORD_SYNC 3 + +/* + * followed by filesize (MPI_Offset) + */ +#define LOGFS_FILE_RECORD_SETSIZE 4 + +/* used for debugging; If enabled, magicstart and magicstop will + * be written before and after the recordheader */ +//#define LOGFS_FILE_RECORDMAGIC + +#define LOGFS_FILE_RECORDMAGIC_START "[magicstart] " +#define LOGFS_FILE_RECORDMAGIC_STOP "[magicstop ] " + + +/* Record struct for in the logfs metalog */ +typedef struct +{ +#ifdef LOGFS_FILE_RECORDMAGIC + char magic_start[16]; +#endif + int recordtype; + double timestamp; +#ifdef LOGFS_FILE_RECORDMAGIC + char magic_stop[16]; +#endif +} logfs_file_recordstruct; + + +/* Header that goes at the beginning of both metadata and datafiles */ +typedef struct +{ + char magic[64]; + /* 
don't store epoch here; otherwise we need an alreduce when we lazily + * open the logfile to inform other CPUs of the highest epoch number */ + /*int epoch; */ /* epoch number of last epoch in file */ +} logfs_file_headerstruct; + + + +char * logfs_flags_to_string(int flags) +{ + if (flags == 2) return "REPLAY"; + if (flags == 3) return "ACTIVE"; + return NULL; +} +void dump_logfs(char *filename) +{ + int fd, ret; + logfs_logfsfile_header h; + + fd = open(filename, O_RDONLY); + ret = read(fd, &h, sizeof(h)); + printf("magic: %s flags %s count %d epoch %d base |%s|\n", + h.magic, logfs_flags_to_string(h.flags), + h.logfilecount, h.epoch, h.logfilebase); + +} + +void extract_typemap(char *prefix, int fd, FILE *output) +{ + int64_t count; + int64_t *indices; + int64_t *blocklens; + int ret, i; + ret = read(fd, &count, sizeof(count)); + indices = malloc(count*sizeof(*indices)); + blocklens = malloc(count*sizeof(*blocklens)); + ret = read(fd, indices, count*sizeof(*indices)); + ret = read(fd, blocklens, count*sizeof(*blocklens)); + fprintf(output, "%s ", prefix); + for (i=0; i +#include + +#include "logfs.h" +#include "ad_logfs.h" +#include "ad_logfs_common.h" +#include "logfs_info.h" +#include "logfs_file.h" +#include "writering.h" +#include "rtree.h" +#include "adio_extern.h" +#include "typehelper.h" +#include "logfs_rtree.h" +#include "logfs_user.h" + + +#define LOGFS_LOCKFILE_MAGIC "logfs-logfsfile\n" + + +/* Define if you want logfs to sync often */ +/* #define LOGFS_DOSYNC */ + + +/* structure for having a writering write to MPI file */ +typedef struct { + /* could also add hints / mode flags here */ + char *filename; + MPI_File file; + MPI_Request writereq; + MPI_Request readreq; + int readopen; /* is file opened for reading&writing */ + int writeopen; + MPI_Status status; + unsigned int readsize; /* size of active read request */ + unsigned int writesize; /* size of active write request */ +} writering_mpi_data; + +#define LOGFS_FLAG_MODE_REPLAY 2 +#define 
LOGFS_FLAG_MODE_ACTIVE 3 + /* real file is opened with active + * logging */ + +typedef struct { + char magic[64]; /* file magic */ + int flags; + int logfilecount; /* max. number of lock files possibly created for + * this file (acros reopens) (== number of CPUs + * used in open/create) */ + int epoch; /* next epoch number (used in reopen) */ + char logfilebase[255]; /* base filename for logfiles */ +} logfs_logfsfile_header; + + +/* try to keep some statistics so that we can adapt + * (eg upgrading to read from read_some if we miss too much reads) */ +typedef struct { + int rtree_miss; + int rtree_hit; + int rtree_overflow; + int rtree_indep_flush; +} logfs_stats; + +struct ADIO_LOGFS_Hints { + int debug; /* output debug info */ + int readmode; /* level of read support requested by the user */ + int datablocksize; /* size of write combining buffer (data) */ + int datablockcount; /* Number of write buffers (data) */ + int metablocksize; /* size of buffer for metalog */ + int metablockcount; /* number of buffers for metalog */ + int flushblocksize; /* size of intermediate buffer when replaying log file */ + int sync; /* Don't do async write combining */ + char *logfilebase; /* Basename/dir for logfiles */ + int replay_on_close; /* replay when closing */ + int timereplay; /* output replay timing */ +}; + +struct ADIO_LOGFS_Data { + + struct ADIO_LOGFS_Hints hints; + + /* actual readmode */ + int readmode; + + /* used for writering */ + writering_handle writedata; /* for writing raw data */ + writering_handle writemeta; /* for writing metadata */ + writering_mpi_data writedata_state; /* info for writing to file */ + writering_mpi_data writemeta_state; + + logfs_file_handle logfsfile; + char logfilebase[PATH_MAX]; + char *realfilename; + + /* logfsfile (handled by CPU 0) */ + logfs_logfsfile_header logfsfileheader; + char *logfsfilename; + MPI_File logfsfilehandle; + + /* lockfile (indicates logfs file is already open) */ + char *lockfilename; + MPI_File 
lockfilehandle; + + MPI_Comm comm; /* comm used for file */ + int commrank; /* rank in comm */ + + ADIO_Offset filesize; /* current size of the file (or what this CPU thinks) */ + + ADIO_Offset view_disp; + MPI_Datatype view_etype; + MPI_Datatype view_ftype; + MPI_Aint view_ftype_extent; + int view_ftype_size; + int view_etype_size; + + + logfs_rtree tree; + int rtree_valid; /* if the rtree is up to date */ + int file_valid; /* If the real file is up to date */ + + logfs_stats stats; + + + /* file handles */ + MPI_File realfile_single; + MPI_File realfile_collective; + + /* User replay */ + int user_replay; + logfs_user_replay_cb user_replay_cb; + + int user_amode; +}; + +typedef struct ADIO_LOGFS_Data ADIO_LOGFS_Data; + + +/**************************************************************************/ +/**************************************************************************/ +/************************* forwards ***************************/ +/**************************************************************/ + +static void logfs_replay_buildrtree(ADIO_LOGFS_Data * data, int all); +static inline void logfs_safeprefix(const char *name, char *dest, int size); +int logfs_replay_helper(ADIO_LOGFS_Data * data, int collective); +void logfs_transfer_hints(MPI_Info source, MPI_Info dest); +static int logfs_user_replay(ADIO_LOGFS_Data * data); + +static void debuginfo(const char *str, ...); + + +static inline int checkError(int ret) +{ + char msg[MPI_MAX_ERROR_STRING]; + int resultlen; + if (ret == MPI_SUCCESS) + return 0; + + MPI_Error_string(ret, msg, &resultlen); + debuginfo("%s\n", msg); + MPL_backtrace_show(stderr); + assert(0); + return -1; +} + +static void debuginfo(const char *str, ...) 
+{ + va_list list; + va_start(list, str); + fprintf(stderr, "logfs: "); + vfprintf(stderr, str, list); + va_end(list); +} + +static const char *readmode2string(int readmode) +{ + switch (readmode) { + case LOGFS_READMODE_NONE: + return "readmode_none"; + case LOGFS_READMODE_SOME: + return "readmode_some"; + case LOGFS_READMODE_PHASED: + return "readmode_phased"; + case LOGFS_READMODE_FULL: + return "readmode_full"; + default: + return "(unknown readmode)"; + }; +} + +/**************************************************************************** + * writering writing to mpi files * + ****************************************************************************/ +static int logfs_writering_mpi_init(void *opsdata, int read, int write) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + MPI_Info info = MPI_INFO_NULL; + int flags; + + assert(data->filename); + + data->writereq = MPI_REQUEST_NULL; + data->readreq = MPI_REQUEST_NULL; + + /* open file */ + assert(data->filename); + + checkError(MPI_Info_create(&info)); + /* todo: set some beter hints: mostly_writes, mostly_reads, ... */ + checkError(MPI_Info_set(info, "access_style", "sequential")); + + flags = MPI_MODE_UNIQUE_OPEN | MPI_MODE_CREATE; + /* cannot use tighter permissions due to replay-on-close */ + flags |= MPI_MODE_RDWR; + + /* Are MPI_File functions reentrant?? 
*/ + checkError(MPI_File_open(MPI_COMM_SELF, data->filename, flags, info, &data->file)); + + checkError(MPI_Info_free(&info)); + + data->writeopen = write; + data->readopen = read; + return 1; +} + +static int logfs_writering_mpi_done(void *opsdata) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + assert(data); + assert(MPI_REQUEST_NULL == data->writereq); + assert(MPI_REQUEST_NULL == data->readreq); + + checkError(MPI_File_close(&data->file)); + + return 1; +} + +static int logfs_writering_mpi_start_write(void *opsdata, WRR_OFFSET ofs, + const void *writedata, unsigned int size) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + + assert(data); + assert(data->writereq == MPI_REQUEST_NULL); + + checkError(MPI_File_iwrite_at(data->file, ofs, (void *) writedata, size, + MPI_BYTE, &data->writereq)); + + return 1; +} + +static int logfs_writering_mpi_test_write(void *opsdata, unsigned int *written) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + MPI_Status stat; + int flag; + int w = 0; + + assert(data); + assert(data->writereq != MPI_REQUEST_NULL); + + checkError(MPI_Test(&data->writereq, &flag, &stat)); + + if (flag) + checkError(MPI_Get_count(&stat, MPI_BYTE, &w)); + + *written = (unsigned int) w; + return flag; +} + +static int logfs_writering_mpi_wait_write(void *opsdata, unsigned int *written) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + MPI_Status stat; + int w; + + assert(data); + assert(data->writereq != MPI_REQUEST_NULL); + + checkError(MPI_Wait(&data->writereq, &stat)); + checkError(MPI_Get_count(&stat, MPI_BYTE, &w)); + *written = (unsigned int) w; + + assert(data->writereq == MPI_REQUEST_NULL); + + return 1; +} + +static int logfs_writering_mpi_flush(void *opsdata) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + assert(data); + + /* no flush with outstanding requests */ + assert(data->writereq == MPI_REQUEST_NULL); + assert(data->readreq == MPI_REQUEST_NULL); + +#ifdef 
LOGFS_DOSYNC + /* flush logfile */ + checkError(MPI_File_sync(data->file)); +#endif + + return 1; +} + +static int logfs_writering_mpi_reset(void *opsdata, WRR_OFFSET ofs) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + assert(data); + + /* no reset with outstanding requests */ + assert(data->writereq == MPI_REQUEST_NULL); + assert(data->readreq == MPI_REQUEST_NULL); + + /* truncate file */ + checkError(MPI_File_set_size(data->file, ofs)); + + return 1; +} + +static int logfs_writering_mpi_getsize(void *opsdata, WRR_OFFSET * ofs) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + MPI_Offset offset; + + assert(data); + assert(data->file != MPI_FILE_NULL); + + checkError(MPI_File_get_size(data->file, &offset)); + *ofs = offset; + + return 1; +} + + + +static int logfs_writering_mpi_start_read(void *opsdata, WRR_OFFSET ofs, void *readdata, + unsigned int size) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + + assert(MPI_FILE_NULL != data->file); + + assert(data->readreq == MPI_REQUEST_NULL); + + + checkError(MPI_File_iread_at(data->file, ofs, readdata, size, MPI_BYTE, &data->readreq)); + + return 1; +} + +static int logfs_writering_mpi_wait_read(void *opsdata, unsigned int *size) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + MPI_Status stat; + int count; + + assert(data); + assert(size); + assert(data->readreq != MPI_REQUEST_NULL); + + checkError(MPI_Wait(&data->readreq, &stat)); + assert(data->readreq == MPI_REQUEST_NULL); + + checkError(MPI_Get_elements(&stat, MPI_BYTE, &count)); + *size = count; + return count; +} + +static int logfs_writering_mpi_test_read(void *opsdata, unsigned int *size) +{ + writering_mpi_data *data = (writering_mpi_data *) opsdata; + MPI_Status stat; + int flag; + int count; + + assert(size); + assert(data->readreq != MPI_REQUEST_NULL); + + checkError(MPI_Test(&data->readreq, &flag, &stat)); + if (!flag) + return flag; + + checkError(MPI_Get_elements(&stat, MPI_BYTE, 
&count)); + *size = count; + return 1; +} + +/***************************************************************************/ +/***** logfs_file callbacks ************************************************/ +/***************************************************************************/ + +/* logfs file also uses a callback mechanism to perform the actual + * file I/O */ + +/* called the first time something meaningful is written to the log */ +/* don't need to do anyhting; the writebuffers are already created */ +static int logfs_file_cb_init(void *userdata) +{ + return 1; +} + +static inline writering_handle logfs_file_cb_findhandle(void *userdata, int log) +{ + ADIO_LOGFS_Data *logfs = (ADIO_LOGFS_Data *) userdata; + writering_handle handle = 0; + switch (log) { + case LOGFS_FILE_LOG_DATA: + handle = logfs->writedata; + break; + case LOGFS_FILE_LOG_META: + handle = logfs->writemeta; + break; + default: + assert(0 /* UNKNOWN WRITE TYPE */); + }; + return handle; +} + +static int logfs_file_cb_write(void *userdata, ADIO_Offset ofs, const void *data, int size, int log) +{ + // ADIO_LOGFS_Data * logfs = (ADIO_LOGFS_Data *) userdata; + writering_handle handle = logfs_file_cb_findhandle(userdata, log); + + writering_write(handle, ofs, data, size); + return 1; +} + +static int logfs_file_cb_done(void *userdata) +{ + return 1; +} + +static int logfs_file_cb_restart(void *userdata, ADIO_Offset offset, int log) +{ + //ADIO_LOGFS_Data * logfs = (ADIO_LOGFS_Data *) userdata; + writering_handle handle = logfs_file_cb_findhandle(userdata, log); + writering_reset(handle, offset); + return 1; +} + +static int logfs_file_cb_getsize(void *userdata, ADIO_Offset * ofs, int log) +{ + //ADIO_LOGFS_Data * logfs = (ADIO_LOGFS_Data *) userdata; + writering_handle handle = logfs_file_cb_findhandle(userdata, log); + writering_getsize(handle, ofs); + return 1; +} + +static int logfs_file_cbr_init(void *userdata) +{ + return 1; +} + +static int logfs_file_cbr_done(void *userdata) +{ + return 1; +} 
+ +static int logfs_file_cbr_read(void *userdata, ADIO_Offset ofs, void *data, int size, int log) +{ + //ADIO_LOGFS_Data * logfs = (ADIO_LOGFS_Data *) userdata; + + writering_handle handle = logfs_file_cb_findhandle(userdata, log); + + + return writering_read(handle, ofs, data, size); +} + +/***************************************************************************/ +/***** Lock file ***********************************************************/ +/***************************************************************************/ + +/* check if the lockfile exists */ +static int logfs_lockfile_islocked(ADIO_LOGFS_Data * data) +{ + int ret; + MPI_File file; + + ret = MPI_File_open(MPI_COMM_SELF, data->lockfilename, MPI_MODE_RDONLY, MPI_INFO_NULL, &file); + if (ret != MPI_SUCCESS) + return 0; + + MPI_File_close(&file); + return 1; +} + +static int logfs_lockfile_lock(ADIO_LOGFS_Data * data) +{ + if (!data->commrank) { + int ret; + assert(data->lockfilehandle == MPI_FILE_NULL); + ret = MPI_File_open(MPI_COMM_SELF, data->lockfilename, + MPI_MODE_WRONLY | MPI_MODE_CREATE | MPI_MODE_EXCL | + MPI_MODE_DELETE_ON_CLOSE, MPI_INFO_NULL, &data->lockfilehandle); + if (ret != MPI_SUCCESS) + return 0; + } + checkError(MPI_Barrier(data->comm)); +#ifndef NDEBUG + assert(logfs_lockfile_islocked(data)); + checkError(MPI_Barrier(data->comm)); +#endif + return 1; +} + +static int logfs_lockfile_unlock(ADIO_LOGFS_Data * data) +{ + if (!data->commrank) { + /* delete on close should remove the file */ + checkError(MPI_File_close(&data->lockfilehandle)); + + /*checkError(MPI_File_delete (data->lockfilename, MPI_INFO_NULL)); */ + } + MPI_Barrier(data->comm); +#ifndef NDEBUG + /* Check if DELETE_ON_CLOSE works + * (of course, there is a slim chance that somebody else locked the file in + * the mean time) */ + assert(!logfs_lockfile_islocked(data)); + + /* 2nd barrier needed to avoid the one of our CPUs has locked the file + * again in the mean time (which would cause the assert above to fail) */ + 
MPI_Barrier(data->comm); +#endif + return 1; +} + +static void logfs_logfsfile_update(ADIO_LOGFS_Data * data) +{ + if (data->commrank) + return; + + assert(data->logfsfilehandle != MPI_FILE_NULL); + + checkError(MPI_File_write(data->logfsfilehandle, &data->logfsfileheader, + sizeof(data->logfsfileheader), MPI_BYTE, MPI_STATUS_IGNORE)); +#ifdef LOGFS_DOSYNC + checkError(MPI_File_sync(data->logfsfilehandle)); +#endif +} + + +/* Read the header from an existing logfs file */ +static int logfs_logfsfile_read(const char *filename, logfs_logfsfile_header * dest) +{ + logfs_logfsfile_header h; + MPI_File file; + int ret = 0; + + MPI_Status stat; + + ret = MPI_File_open(MPI_COMM_SELF, (char *) filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &file); + + if (ret != MPI_SUCCESS) + return 0; + + /* read values and delete file */ + if (MPI_SUCCESS == MPI_File_read(file, &h, sizeof(h), MPI_BYTE, &stat)) { + int ele; + MPI_Get_elements(&stat, MPI_BYTE, &ele); + if (ele == sizeof(h) && !strncmp(h.magic, LOGFS_LOCKFILE_MAGIC, + strlen(LOGFS_LOCKFILE_MAGIC))) { + /* everything OK */ + *dest = h; + ret = 1; + } + else { + /* Couldn't read enough from the file, or the magic + * didn't work out*/ + ret = 0; + } + } + else { + /* error reading file */ + ret = 0; + } + + MPI_File_close(&file); + + return ret; +} + +/* try to open an existing logfile; + * return true if an existing logfsfile was found; + * Return false (and create a default one) otherwise + */ +static int logfs_logfsfile_create (ADIO_LOGFS_Data * data, int access_mode) +{ + int commsize; + logfs_logfsfile_header h; + int reopen = 0; + + MPI_Comm_size(data->comm, &commsize); + + /* CPU 0 tries to open the header */ + if (!data->commrank) { + + /* Create default header */ + memset(&h, 0, sizeof(h)); + strncpy (h.magic, LOGFS_LOCKFILE_MAGIC, sizeof(h.magic)); + h.flags = LOGFS_FLAG_MODE_ACTIVE; + h.logfilecount = commsize; + h.epoch = 0; + memset(&h.logfilebase[0], 0, sizeof(h.logfilebase)); + if (data->logfilebase != NULL) + 
strncpy (h.logfilebase, data->logfilebase, sizeof(h.logfilebase)); + + /* see if we can open an existing logfs file */ + assert(data->logfsfilehandle == MPI_FILE_NULL); + if (logfs_logfsfile_read(data->logfsfilename, &h)) { + /* Could open existing file; Load the old values */ + data->logfsfileheader = h; + + /* Increase the epoch */ + ++data->logfsfileheader.epoch; + + if (data->logfsfileheader.logfilecount != commsize) { + fprintf(stderr, "logfs: Error: Cannot use %i CPUs to " + "open logfs file %s, created on %i CPUs!\n", + commsize, data->logfsfilename, data->logfsfileheader.logfilecount); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + reopen = 1; + } + else { + /* Use defaults for new file */ + data->logfsfileheader = h; + + reopen = 0; + } + + /* Open file for writing. Replay-on-close might require reading the + * header, so we cannot open write-only */ + checkError(MPI_File_open(MPI_COMM_SELF, data->logfsfilename, + MPI_MODE_CREATE|MPI_MODE_RDWR, + MPI_INFO_NULL, &data->logfsfilehandle)); + logfs_logfsfile_update(data); + } + + MPI_Bcast(&data->logfsfileheader, sizeof(data->logfsfileheader), MPI_BYTE, 0, data->comm); + + MPI_Barrier(data->comm); + return reopen; +} + + +static void logfs_logfsfile_remove(ADIO_LOGFS_Data * data) +{ + MPI_Barrier(data->comm); + + if (!data->commrank) { + assert(data->logfsfilehandle != MPI_FILE_NULL); + MPI_File_close(&data->logfsfilehandle); + MPI_File_delete(data->logfsfilename, MPI_INFO_NULL); + } + + MPI_Barrier(data->comm); +} + + +/***************************************************************************/ +/***************************************************************************/ +/***************************************************************************/ + + +/* Create lockfilename given the real filename */ +static void logfs_lockfilename(const char *filename, char *buf, int bufsize) +{ + char buf2[PATH_MAX]; + logfs_safeprefix(filename, buf2, sizeof(buf2)); + + logfs_safeprefix(filename, buf2, sizeof(buf2)); + 
snprintf(buf, bufsize, "%s.logfslock", buf2); +} + +/* create logfsfilename give the real filename; */ +static void logfs_logfsfilename(const char *filename, char *buf, int bufsize) +{ + char buf2[PATH_MAX]; + + logfs_safeprefix(filename, buf2, sizeof(buf2)); + snprintf(buf, bufsize, "%s.logfs", buf2); +} + +static int logfs_logfilename(const char *logfilebase, char *buf, int bufsize, + int cpunum, int logtype) +{ + snprintf (buf, bufsize, "%s.%u.%s", logfilebase, cpunum, + (logtype == LOGFS_FILE_LOG_META ? "meta" : "data")); + return 1; +} + +/* find out if this file is a logfs file */ +/* reentry problem here in ADIO functions? */ +int logfs_probe(MPI_Comm comm, const char *filename) +{ + char buf[255]; + MPI_File handle; + int rank; + int ret; + + MPI_Comm_rank(comm, &rank); + + if (!rank) { + logfs_logfsfilename(filename, buf, sizeof(buf) - 1); + + /* try to open logfsfile */ + if (MPI_SUCCESS == MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_RDONLY, + MPI_INFO_NULL, &handle)) { + MPI_File_close(&handle); + ret = 1; + } + } + + MPI_Bcast(&ret, 1, MPI_INT, 0, comm); + return ret; +} + +/* called by the MPI-IO delete function when a logfs file is detected + * Only deletes the .meta, .data, and .logfs files */ +int logfs_delete (const char * filename) +{ + char buf[255]; + MPI_File handle; + logfs_logfsfile_header header; + int i; + int err; + MPI_Status status; + + /* see if the file is locked */ + /* TODO */ + + /* open logfsfile; get logfile basename & count; remove */ + logfs_logfsfilename(filename, buf, sizeof(buf) - 1); + + err = MPI_File_open(MPI_COMM_SELF, buf, MPI_MODE_RDONLY, MPI_INFO_NULL, &handle); + + /* if the logfsfile doesn't exist, no prob; just remove the real file */ + if (err != MPI_SUCCESS) + return 0; + + MPI_File_read(handle, &header, sizeof(header), MPI_BYTE, &status); + MPI_Get_count(&status, MPI_BYTE, &err); + + MPI_File_close(&handle); + + if (err != sizeof(header) || + (strncmp(header.magic, LOGFS_LOCKFILE_MAGIC, 
strlen(LOGFS_LOCKFILE_MAGIC)))) { + debuginfo("MPI_File_delete: %s; no valid logfsfile found!" + "\nNot trying to delete logfsfile/logfiles\n", filename); + return 0; + } + + /* don't delete an open file! */ + /* not active for now */ +/* assert (!(header.flags & LOGFS_FLAG_MODE_ACTIVE)); */ + /* delete logfsfile */ + MPI_File_delete(buf, MPI_INFO_NULL); + + /* MPI_File_delete shouldn't be called on every CPU, + * so the one volunteer has to remove all of the logfiles + * (manot always possible); + * This will not work out if there are different logbases + * on different CPUs (or the same logbase but on different filesystems) */ + for (i = 0; i < header.logfilecount; ++i) { + /* generate logfilename and remove */ + logfs_logfilename(header.logfilebase, buf, sizeof(buf) - 1, i, LOGFS_FILE_LOG_META); + MPI_File_delete(buf, MPI_INFO_NULL); + + /* same for data log */ + logfs_logfilename(header.logfilebase, buf, sizeof(buf) - 1, i, LOGFS_FILE_LOG_DATA); + MPI_File_delete(buf, MPI_INFO_NULL); + } + return 1; +} + +/* by design, always true */ +int logfs_active(ADIO_File fd) +{ + //ADIO_LOGFS_Data * data = logfs_data (fd); + return 1; +} + +/* =================== flushing tree to disk ======================== */ +typedef struct { + MPI_File datalog; + MPI_Request readreq; + MPI_Request writereq; + MPI_Info readinfo; + MPI_Info writeinfo; + int collective; + ADIO_LOGFS_Data *logfsdata; +} logfs_flushtree_state; + +static int logfs_flush_start(void *userdata, int coll) +{ + logfs_flushtree_state *state = (logfs_flushtree_state *) userdata; + state->readreq = MPI_REQUEST_NULL; + state->writereq = MPI_REQUEST_NULL; + state->collective = coll; + MPI_Info_create(&state->readinfo); + MPI_Info_set(state->readinfo, "access_style", "read_once,sequential"); + MPI_Info_create(&state->writeinfo); + MPI_Info_set(state->writeinfo, "access_style", "write_once,sequential"); + + /* make sure datalog is flushed (since we read directly from the disk) */ + 
writering_flush(state->logfsdata->writedata); + + return 1; +} + +static int logfs_flush_stop(void *userdata) +{ + logfs_flushtree_state *state = (logfs_flushtree_state *) userdata; + assert(state->readreq == MPI_REQUEST_NULL); + assert (state->writereq == MPI_REQUEST_NULL); + MPI_Info_free(&state->readinfo); + MPI_Info_free(&state->writeinfo); + return 1; +} + +static int logfs_flush_readstart(void *buf, MPI_Datatype memtype, MPI_Datatype filetype, + void *userdata) +{ + //MPI_Info info; + int err; + logfs_flushtree_state *state = (logfs_flushtree_state *) userdata; + assert(state->readreq == MPI_REQUEST_NULL); + + + /* TODO: create better info (such as mostly-sequential, readonce, ...) */ + err = MPI_File_set_view(state->datalog, 0, MPI_BYTE, filetype, "native", MPI_INFO_NULL); + assert(MPI_SUCCESS == err); + err = MPI_File_iread_at(state->datalog, 0, buf, 1, memtype, &state->readreq); + assert(MPI_SUCCESS == err); + + /* restore view; we are using the same file handle as the writering is */ + err = MPI_File_set_view(state->datalog, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL); + assert(MPI_SUCCESS == err); + return 1; +} + +static int logfs_flush_readwait(void *userdata) +{ + logfs_flushtree_state *state = (logfs_flushtree_state *) userdata; + MPI_Wait(&state->readreq, MPI_STATUS_IGNORE); + return 1; +} + +/* Remove logfs: prefix if any to avoid recursion */ +static inline void logfs_safeprefix(const char *name, char *dest, int size) +{ + if (strncmp(name, "logfs:", 6) != 0) + ADIOI_Strncpy(dest, name, size); + else + ADIOI_Strncpy(dest, name + 6, size); +} + +static int logfs_ensureopen (ADIO_LOGFS_Data * data, + MPI_Info info, int collective) +{ + int err; + char buf[255]; + if (collective) { + if (MPI_FILE_NULL != data->realfile_collective) + return -1; + logfs_safeprefix(data->realfilename, buf, sizeof(buf)); + err=MPI_File_open (data->comm, buf, data->user_amode, + info, &data->realfile_collective); + if (err != MPI_SUCCESS) goto fn_exit; + + } + if 
(MPI_FILE_NULL != data->realfile_single) + return -1; + /* think about consistency in this case; probably need to flush these + * too when the user calls sync */ + logfs_safeprefix(data->realfilename, buf, sizeof(buf)); + err = MPI_File_open(MPI_COMM_SELF, buf, + MPI_MODE_RDWR|MPI_MODE_CREATE, + info, &data->realfile_single); +fn_exit: + return err; +} + +/* for writes we assume that the memory buffer is continuous and in order + * However we need to set the filetype which is collective! */ +static int logfs_flush_writestart(void *buf, MPI_Datatype filetype, int bytes, void *userdata) +{ + logfs_flushtree_state *state = (logfs_flushtree_state *) userdata; + MPI_File handle = (state->collective ? state->logfsdata->realfile_collective : + state->logfsdata->realfile_single); + assert(state->writereq == MPI_REQUEST_NULL); + + if (MPI_FILE_NULL == handle) { + logfs_ensureopen(state->logfsdata, state->writeinfo, state->collective); + handle = (state->collective ? state->logfsdata->realfile_collective : + state->logfsdata->realfile_single); + assert(MPI_FILE_NULL != handle); + } + + checkError(MPI_File_set_view (handle, 0, MPI_BYTE, filetype, + "native", state->writeinfo)); + checkError(MPI_File_iwrite_at (handle, 0, buf, bytes, + MPI_BYTE, &state->writereq)); + + return 1; +} + +static int logfs_flush_writewait(void *userdata) +{ + logfs_flushtree_state *state = (logfs_flushtree_state *) userdata; + MPI_Wait(&state->writereq, MPI_STATUS_IGNORE); + return 1; +} + +static void logfs_flushtree(ADIO_LOGFS_Data * data, rtree_const_handle tree, int collective) +{ + ADIO_Offset filesize = 0; + int flushsize; + logfs_flushtree_state state; + logfs_rtree_flush_cb cb; + + cb.start = logfs_flush_start; + cb.stop = logfs_flush_stop; + cb.readwait = logfs_flush_readwait; + cb.readstart = logfs_flush_readstart; + cb.writestart = logfs_flush_writestart; + cb.writewait = logfs_flush_writewait; + + state.datalog = data->writedata_state.file; + state.readreq = state.writereq = 
MPI_REQUEST_NULL; + state.readinfo = MPI_INFO_NULL; + state.writeinfo = MPI_INFO_NULL; + state.collective = collective; + state.logfsdata = data; + + if (data->hints.flushblocksize != 0) + flushsize = data->hints.flushblocksize; + else + flushsize = 1024*1024; + + logfs_rtree_flush (&data->tree, flushsize, + &cb, &state, collective, &filesize, data->comm); +} + +int logfs_replay_helper(ADIO_LOGFS_Data * data, int collective) +{ + double start = MPI_Wtime(); + double stop; + + if (!data->user_replay) { + /* Needed in case file is open in wr-only mode, there are pending + * writes (in the background) and we want to read from the file; + * The read causes an reopen into rw-mode, but cannot do this because of + * an active write request... Flushing first avoids active writes */ + + writering_flush(data->writedata); + writering_flush(data->writemeta); + + data->rtree_valid = 0; + if (!data->rtree_valid) { + /* if there is no valid rtree, build the tree (all epochs) */ + /* (we need a full replay because we are also forced to return + * data that other CPUs wrote in older epochs...) 
*/ + logfs_replay_buildrtree(data, 1); + } + + /* dump tree to disk (collective) */ + logfs_flushtree(data, data->tree.rtree, collective); + } + else { + logfs_user_replay(data); + } + + /* truncate datalogfiles if they aren't empty already */ + /* also empty the tree */ + if (!rtree_empty(data->tree.rtree)) { + logfs_file_clear(data->logfsfile, 0); + rtree_clear(data->tree.rtree); + data->tree.rangesize = 0; + } + + stop = MPI_Wtime(); + if (data->hints.debug || data->hints.timereplay) { + debuginfo("Replay: start,stop = %f,%f\n", start, stop); + + if ((stop - start) > 0.001) { + debuginfo("Replay (collective=%u) of %lu bytes took: %f (%f MB/s)\n", + (unsigned) collective, + (long unsigned) data->tree.rangesize, stop - start, + (double) data->tree.rangesize / ((stop - start) * 1024 * 1024)); + } + else { + debuginfo("Replay (collective=%u) of %lu bytes took: %f\n", + collective, data->tree.rangesize, stop - start); + } + } + + /* Did a full replay, so the real file is now valid */ + data->file_valid = 1; + return 0; +} + +/* called when the WHOLE file needs to be replayed; + * collective */ +int logfs_replay(ADIO_File fd, int collective) +{ + + ADIO_LOGFS_Data *data = logfs_data(fd); + + data->logfsfileheader.flags |= LOGFS_FLAG_MODE_REPLAY; + logfs_logfsfile_update(data); + + logfs_replay_helper(data, collective); + data->file_valid = 1; + + data->logfsfileheader.flags ^= ~LOGFS_FLAG_MODE_REPLAY; + logfs_logfsfile_update(data); + + return 1; +} + + +/* parse hints; get blocksize, blockcount & logfilebase */ +static inline void logfs_activate_initwritering(ADIO_File fd, + ADIO_LOGFS_Data * data, MPI_Info info, int reopen) +{ + writering_ops ops; + char buf[255]; + int read; + int write; + + /* find out if we're readonly, write only or readwrite */ + if (fd->access_mode & ADIO_WRONLY) { + read = 0; + write = 1; + } + else if (fd->access_mode & ADIO_RDONLY) { + /* we set write to 1 so that the file can truncated/removed after + * replay */ + read = 1; + write = 1; + 
} + else if (fd->access_mode & ADIO_RDWR) { + read = 1; + write = 1; + } + else + debuginfo("invalid RD/WR flags in logfs_activate??\n"); + + /* defaults are in hints */ + data->logfilebase[0] = 0; + + /* first check hint */ + if (data->hints.logfilebase) + ADIOI_Strncpy(data->logfilebase, data->hints.logfilebase, PATH_MAX); + + /* check environment (only if no hint is set) */ + if (getenv ("LOGFSTMP")) + ADIOI_Strncpy(data->logfilebase, getenv("LOGFSTMP"), PATH_MAX); + + /* if no logfilebase was set, and the user indicated no preference, + * put it next to the real file */ + if (data->logfilebase[0] == 0) + { + char buf[255]; + logfs_safeprefix(data->realfilename, buf, sizeof(buf)); + ADIOI_Strncpy(data->logfilebase, buf, PATH_MAX); + } + + assert(data->logfilebase); + + /* if nothing specified, choose 2 1MB for data buffers */ + if (!data->hints.datablockcount || !data->hints.datablocksize) { + data->hints.datablockcount = 2; + data->hints.datablocksize = 4 * 1024 * 1024; + } + + /* if nothing given, choose 2 64k buffers for metadata */ + if (!data->hints.metablockcount || !data->hints.metablocksize) { + data->hints.metablockcount = 2; + data->hints.metablocksize = 64 * 1024; + } + + + ops.start_write = logfs_writering_mpi_start_write; + ops.test_write = logfs_writering_mpi_test_write; + ops.wait_write = logfs_writering_mpi_wait_write; + ops.flush = logfs_writering_mpi_flush; + ops.init = logfs_writering_mpi_init; + ops.done = logfs_writering_mpi_done; + ops.getsize = logfs_writering_mpi_getsize; + ops.reset = logfs_writering_mpi_reset; + ops.start_read = logfs_writering_mpi_start_read; + ops.test_read = logfs_writering_mpi_test_read; + ops.wait_read = logfs_writering_mpi_wait_read; + + /* fill in state for datalog en metadata log files */ + logfs_logfilename(data->logfilebase, buf, sizeof(buf) - 1, data->commrank, LOGFS_FILE_LOG_DATA); + data->writedata_state.filename = ADIOI_Strdup(buf); + data->writedata_state.file = MPI_FILE_NULL; + 
data->writedata_state.readopen = 0; + data->writedata_state.writeopen = 0; + data->writedata_state.readreq = data->writedata_state.writereq = MPI_REQUEST_NULL; + + logfs_logfilename(data->logfilebase, buf, sizeof(buf) - 1, data->commrank, LOGFS_FILE_LOG_META); + data->writemeta_state.filename = ADIOI_Strdup(buf); + data->writemeta_state.file = MPI_FILE_NULL; + data->writemeta_state.readopen = 0; + data->writemeta_state.writeopen = 0; + data->writemeta_state.readreq = data->writemeta_state.writereq = MPI_REQUEST_NULL; + + /* create the write buffers; They do not allocate memory/open the files until the + * first read/write anyway */ + data->writedata = writering_create(data->hints.datablocksize, data->hints.datablockcount, + &ops, &data->writedata_state, read, write); + + data->writemeta = writering_create(data->hints.metablocksize, data->hints.metablockcount, + &ops, &data->writemeta_state, read, write); + + data->realfile_single = MPI_FILE_NULL; + data->realfile_collective = MPI_FILE_NULL; + + writering_setsync(data->writedata, data->hints.sync); + writering_setsync(data->writemeta, data->hints.sync); + + if (data->hints.debug) + debuginfo("init writering: meta: %i x %i bytes, data: %i x %i bytes, sync=%i\n", + data->hints.metablockcount, data->hints.metablocksize, data->hints.datablockcount, + data->hints.datablocksize, data->hints.sync); +} + +static inline void logfs_activate_initlogfs(ADIO_LOGFS_Data * data, int reopen) +{ + logfs_file_ops logfsops; + logfs_file_readops logfsreadops; + + /* init logfs_file */ + logfsops.done = logfs_file_cb_done; + logfsops.init = logfs_file_cb_init; + logfsops.write = logfs_file_cb_write; + logfsops.restart = logfs_file_cb_restart; + logfsops.getsize = logfs_file_cb_getsize; + + logfsreadops.init = logfs_file_cbr_init; + logfsreadops.read = logfs_file_cbr_read; + logfsreadops.done = logfs_file_cbr_done; + + /* if write_mostly is set, set to read_some else if read is set set + * to read else read_none (talking about access 
style for logfiles)*/ + data->logfsfile = logfs_file_create(data->comm, &logfsops, data, &logfsreadops, data); +} + + +static void logfs_hints_default(struct ADIO_LOGFS_Hints *h) +{ + /* hints default */ + h->debug = 0; + if (getenv("LOGFS_DEBUG")) + h->debug = 1; + h->sync = 0; + h->metablockcount = 0; + h->metablocksize = 0; + h->datablockcount = 0; + h->datablocksize = 0; + h->flushblocksize = 0; + h->readmode = LOGFS_READMODE_SOME; + h->logfilebase = 0; + h->replay_on_close = 0; + h->timereplay = 1; +} + + +/* + * can adjust: + * debug + * readmode (only upgrade between the different modes) + * + * - Can cause invalidation of tree + * - Note that some hints only take affect at open time + */ +static void logfs_process_info(struct ADIO_LOGFS_Hints *hints, MPI_Info info) +{ + char *ptr = 0; + + /* Override timereplay from env */ + if (getenv("LOGFS_TIMEREPLAY")) + hints->timereplay = 1; + + + if (info == MPI_INFO_NULL) + return; + + ad_logfs_hint_bool(info, LOGFS_INFO_DEBUG, &hints->debug); + + ad_logfs_hint_int (info, LOGFS_INFO_DATABLOCKCOUNT, + &hints->datablockcount); + ad_logfs_hint_int (info, LOGFS_INFO_DATABLOCKSIZE, + &hints->datablocksize); + ad_logfs_hint_int (info, LOGFS_INFO_METABLOCKCOUNT, + &hints->metablockcount); + ad_logfs_hint_int (info, LOGFS_INFO_METABLOCKSIZE, + &hints->metablocksize); + ad_logfs_hint_int(info, LOGFS_INFO_FLUSHBLOCKSIZE, + &hints->flushblocksize); + ad_logfs_hint_bool (info, LOGFS_INFO_SYNC, + &hints->sync); + ad_logfs_hint_str (info, LOGFS_INFO_LOGBASE, + &hints->logfilebase); + + ad_logfs_hint_bool (info, LOGFS_INFO_REPLAYCLOSE, + &hints->replay_on_close); + + ad_logfs_hint_bool (info, LOGFS_INFO_TIMEREPLAY, + &hints->timereplay); + + /* Override timereplay from env */ + if (getenv("LOGFS_TIMEREPLAY")) + hints->timereplay = 1; + + + ad_logfs_hint_str(info, LOGFS_INFO_READMODE, &ptr); + hints->readmode = 0; + if (ptr) { + if (!strcmp(ptr, "track_none")) + hints->readmode = LOGFS_READMODE_NONE; + else if (!strcmp(ptr, 
"track_some")) + hints->readmode = LOGFS_READMODE_SOME; + else if (!strcmp(ptr, "track_phased")) + hints->readmode = LOGFS_READMODE_PHASED; + else if (!strcmp(ptr, "track_all")) + hints->readmode = LOGFS_READMODE_FULL; + else if (hints->debug) + debuginfo("logfs: unknown read mode (%s) requested in hint " + "(%s)!\n", ptr, LOGFS_INFO_READMODE); + ADIOI_Free(ptr); + } + +} + +/* modify given hint object so that it contains all the info + * from the hints structure */ +static void logfs_store_info(struct ADIO_LOGFS_Hints *hints, MPI_Info info) +{ + const char *ptr; + assert(info != MPI_INFO_NULL); + + ad_logfs_hint_set_bool(info, LOGFS_INFO_DEBUG, hints->debug); + + switch (hints->readmode) { + case LOGFS_READMODE_NONE: + ptr = "track_none"; + break; + case LOGFS_READMODE_SOME: + ptr = "track_some"; + break; + case LOGFS_READMODE_PHASED: + ptr = "track_phased"; + break; + case LOGFS_READMODE_FULL: + ptr = "track_all"; + break; + default: + ptr = "unknown!"; + debuginfo("logfs: uknown read mode (%i) in hints->readmode\n", hints->readmode); + } + ad_logfs_hint_set_str(info, LOGFS_INFO_READMODE, ptr); + ad_logfs_hint_set_int(info, LOGFS_INFO_DATABLOCKCOUNT, hints->datablockcount); + ad_logfs_hint_set_int(info, LOGFS_INFO_DATABLOCKSIZE, hints->datablocksize); + ad_logfs_hint_set_int(info, LOGFS_INFO_METABLOCKCOUNT, hints->metablockcount); + ad_logfs_hint_set_int(info, LOGFS_INFO_METABLOCKSIZE, hints->metablocksize); + + ad_logfs_hint_set_int(info, LOGFS_INFO_FLUSHBLOCKSIZE, + hints->flushblocksize); + ad_logfs_hint_set_bool(info, LOGFS_INFO_SYNC, hints->sync); + ad_logfs_hint_set_str(info, LOGFS_INFO_LOGBASE, hints->logfilebase); + ad_logfs_hint_set_bool(info, LOGFS_INFO_REPLAYCLOSE, hints->replay_on_close); + ad_logfs_hint_set_bool(info, LOGFS_INFO_TIMEREPLAY, hints->timereplay); +} + +void logfs_transfer_hints(MPI_Info source, MPI_Info dest) +{ + struct ADIO_LOGFS_Hints h; + + logfs_hints_default(&h); + + logfs_process_info(&h, source); + logfs_store_info(&h, dest); + 
if (h.logfilebase) ADIOI_Free(h.logfilebase); +} + +/* update internal hints structure, also update fd->info + * so that MPI_File_get_info returns correct values */ +void logfs_setinfo(ADIO_File fd, MPI_Info info) +{ + ADIO_LOGFS_Data *data = logfs_data(fd); + logfs_process_info(&data->hints, info); + logfs_store_info(&data->hints, fd->info); +} + + +/* + * Lockfile: + * On activate, we need to: + * * Check if a lockfile exists + * * if so: file is already open -> error + * * check if a logfsfile already exists + * * if so it cannot have an ACTIVE state (this would mean the file + * is alreay open somewhere else) + * * If not, we need to load the epoch number from the header + * and append to the logfiles; Also, the number of CPUs should + * match and every CPU should be able to open its personal + * datalog&metalog + * * If the logfsfile doesn't exist, create one + * + * returns 1 if successful. If not successful, populates error_code suitable + * for handing back up to higher-level ROMIO + */ +int logfs_activate(ADIO_File fd, MPI_Info info) +{ + int error_code; + char buf[255]; + const char *prefix; + const int standalone = logfs_standalone(fd); + int locked; + int reopen = 0; + int ret = 1; + + ADIO_LOGFS_Data *data = (ADIO_LOGFS_Data *) + ADIOI_Calloc (1, sizeof(ADIO_LOGFS_Data)); + + /* set default hints */ + logfs_hints_default(&data->hints); + + /* process hints */ + logfs_process_info(&data->hints, info); + + + /* read mode: + * 0: write-only + * 1: reading allowed, but not expected (no write tracking) + * 2: full write tracking + */ + data->readmode = ((fd->access_mode & (ADIO_RDONLY | ADIO_RDWR)) ? 
+ LOGFS_READMODE_SOME : LOGFS_READMODE_NONE); + + /* if readmode is enabled, and the user wants a different mode (selected + * through hints) adjust */ + if ((data->readmode > LOGFS_READMODE_NONE) && (data->hints.readmode > LOGFS_READMODE_NONE)) + data->readmode = data->hints.readmode; + + if (data->hints.debug) { + debuginfo("readmode is %s\n", readmode2string(data->readmode)); + } + + /* setup comm */ + MPI_Comm_dup(fd->comm, &data->comm); + MPI_Comm_rank(data->comm, &data->commrank); + + + /*====================================================== + * Determine filenames * + *======================================================*/ + + /* get real filename (= filename without logfs prefix, + * but including the native filesystem prefix (e.g. pvfs2:) */ + prefix = ADIO_FileTypeToPrefix(fd->file_system); + assert(prefix != 0); + data->realfilename = ADIOI_Malloc(strlen(prefix) + strlen(fd->filename) + 1); + sprintf(data->realfilename, "%s%s", prefix, fd->filename); + + /* basedir */ + if (data->hints.logfilebase != NULL) + ADIOI_Strncpy(data->logfilebase, data->hints.logfilebase, PATH_MAX ); + + /* store logfsfilename (including prefix!) 
*/ + logfs_logfsfilename(data->realfilename, buf, sizeof(buf) - 1); + data->logfsfilename = ADIOI_Strdup(buf); + data->logfsfilehandle = MPI_FILE_NULL; + + /* Store lockfilename (including prefix) */ + logfs_lockfilename(data->realfilename, buf, sizeof(buf) - 1); + data->lockfilename = ADIOI_Strdup(buf); + data->lockfilehandle = MPI_FILE_NULL; + + /* Before doing anything else, see if we can lock the file */ + locked = logfs_lockfile_lock(data); + + if (!locked) { + /* No problem signaling the error/lock failure, but the cleanup + * is messy ;-) So just give up for now */ + fprintf(stderr, "File %s is already opened using logfs!\n" + "(If you're __sure__ it's not opened somewhere else," + " the lockfile (%s) might be stale; try removing it first\n", + data->realfilename, data->lockfilename); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + /* See if the logfs file exists */ + reopen = logfs_logfsfile_create (data, fd->access_mode); + + if (data->hints.debug) { + if (!reopen) + debuginfo("Creating new file...\n"); + else + debuginfo("Reopening existing file...\n"); + } + + /* init logfiles */ + logfs_activate_initwritering(fd, data, info, reopen); + logfs_activate_initlogfs(data, reopen); + + /* logfsfile create reads the epoch number if the file already exists; + * pass this info to the logfs_file */ + logfs_file_setepoch(data->logfsfile, data->logfsfileheader.epoch); + + + /* we expect a call to set the view after activating logfs */ + data->view_disp = 0; + data->view_etype = MPI_DATATYPE_NULL; + data->view_ftype = MPI_DATATYPE_NULL; + + /* an empty tree is up to date */ + data->rtree_valid = 1; + + data->user_amode = fd->access_mode; + + + if (standalone) { + fd->fs_ptr = data; + + ret = logfs_ensureopen (data, MPI_INFO_NULL, 1); + if (ret != MPI_SUCCESS) { + logfs_deactivate(fd); + goto fn_exit; + } + MPI_File_get_size(data->realfile_collective, &data->filesize); + } + else { + /** get some initial state */ + if (!data->commrank) { + + ADIO_Fcntl_t f; + ADIO_Fcntl(fd, 
ADIO_FCNTL_GET_FSIZE, &f, &error_code); + assert(MPI_SUCCESS == error_code); + data->filesize = f.fsize; + } + MPI_Bcast(&data->filesize, 1, ADIO_OFFSET, 0, data->comm); /* init layering */ + + /* no more open required since our slave is already open */ + ADIOI_Layer_init(fd, &ADIO_LOGFS_operations, data, &error_code, 1); + } + + + /* view will be set later */ + + /* create rtree */ + data->tree.rtree = rtree_create(); + data->tree.rangesize = 0; + + /* Assume real file is not valid until the first replay */ + data->file_valid = 0; + data->user_replay = 0; +fn_exit: + + return ret; + +} + +/* Remove the logfiles of this CPU; Should only be called with the logfile + * handles closed */ +static void logfs_logfiles_remove(ADIO_LOGFS_Data * data) +{ + assert(data->writedata_state.file == MPI_FILE_NULL); + assert(data->writemeta_state.file == MPI_FILE_NULL); + + assert(data->writemeta_state.filename); + assert(data->writedata_state.filename); + + MPI_File_delete(data->writemeta_state.filename, MPI_INFO_NULL); + MPI_File_delete(data->writedata_state.filename, MPI_INFO_NULL); +} + +int logfs_deactivate(ADIO_File fd) +{ + ADIO_LOGFS_Data *data = logfs_data(fd); + int standalone = logfs_standalone(fd); + int replay = 0; + + /* File should be locked! 
*/ + assert(data->commrank || data->lockfilehandle != MPI_FILE_NULL); + + if (data->hints.replay_on_close) + replay = 1; + + assert(logfs_active(fd)); + if (data->hints.debug) + debuginfo("Deactivating logfs (replay=%u) on %s\n", replay, fd->filename); + + /* collective replay: no need to replay if no writes */ + if (replay && !(data->user_amode & MPI_MODE_RDONLY)) + { + logfs_replay(fd, 1); + } + + + /* mark file as no longer active */ + data->logfsfileheader.flags &= ~LOGFS_FLAG_MODE_ACTIVE; + + /* if we do a replay, no need to keep the trace files */ + if (replay) + logfs_logfsfile_remove(data); + else + logfs_logfsfile_update(data); + + /* close logfs file */ + logfs_file_free(&data->logfsfile); + + /* close write buffers */ + /* writering calls writeops done (logfs_writering_mpi_done) + * which closes the file */ + writering_free(&data->writedata); + writering_free(&data->writemeta); + + /* have to keep this for last since logfs_data cannot be called anylonger + * after the layering is deactivated */ + if (!standalone) + ADIOI_Layer_done(fd); + + + + if (data->view_etype != MPI_DATATYPE_NULL) + MPI_Type_free(&data->view_etype); + if (MPI_DATATYPE_NULL != data->view_ftype) + MPI_Type_free(&data->view_ftype); + + /* free rtree */ + rtree_free(&data->tree.rtree); + data->tree.rangesize = 0; + + /* close files */ + if (data->realfile_single != MPI_FILE_NULL) + MPI_File_close(&data->realfile_single); + if (data->realfile_collective != MPI_FILE_NULL) + MPI_File_close(&data->realfile_collective); + if (data->logfsfilehandle != MPI_FILE_NULL) + MPI_File_close(&data->logfsfilehandle); + + + if (replay) { + /* since we did a full replay, remove the per-cpu logfiles */ + logfs_logfiles_remove(data); + } + + /* Finally, unlock the file */ + logfs_lockfile_unlock(data); + + /* Free strings */ + ADIOI_Free (data->writemeta_state.filename); + ADIOI_Free (data->writedata_state.filename); + ADIOI_Free(data->logfsfilename); + ADIOI_Free(data->lockfilename); + 
ADIOI_Free(data->realfilename); + + /* free logfilebase hint if any */ + if (data->hints.logfilebase) + ADIOI_Free(data->hints.logfilebase); + + /* Free communicator */ + MPI_Comm_free(&data->comm); + + ADIOI_Free(data); + return 1; +} + + +typedef struct { + logfs_rtree *tree; + ADIO_Offset datalogstart; +} logfs_processtypes_info; + +/* callback for processtypes that just adds the region to the tree + * The userdata should point to a logfs_processtypes_structure */ +static int logfs_processtypes_addtree(void *membuf, int size, ADIO_Offset fileofs, void *data) +{ + logfs_processtypes_info *info = data; + + logfs_rtree_addsplit(info->tree, fileofs, fileofs + size, info->datalogstart); + info->datalogstart += size; + return 1; +} + + +/* + * Look at the write datatype and calculate which parts of the file will be + * affected; Optionally update the rtree or keep track of the filesize + */ +static void logfs_trackwrite(ADIO_LOGFS_Data * data, + MPI_Datatype memtype, int count, ADIO_Offset offset, + int updatetree, int tracksize, ADIO_Offset datalogstart) +{ + ADIO_Offset lastofs; + int memtypesize; + + /* shouldn't call this if no work is to be done */ + assert(updatetree || tracksize); + + MPI_Type_size(memtype, &memtypesize); + + if (!updatetree) { + ADIO_Offset start; + /* use quicker version */ + typehelper_calcrange(data->view_etype, + data->view_ftype, data->view_disp, offset, + memtypesize * count, &start, &lastofs); + + if (data->filesize < lastofs) + data->filesize = lastofs; + + if (data->hints.debug) + debuginfo("trackwrite: lastofs=%lu, filesize=%lu\n", (long unsigned) lastofs, + (long unsigned) data->filesize); + } + else { + DatatypeHandler cb; + logfs_processtypes_info info; + info.datalogstart = datalogstart; + info.tree = &data->tree; + cb.start = 0; + cb.stop = 0; + cb.startfragment = 0; + cb.stopfragment = 0; + cb.processdata = logfs_processtypes_addtree; + + typehelper_calcaccess(data->view_etype, data->view_ftype, + data->view_disp, offset, 
memtypesize * count, &cb, &info); + + /* Since we update the tree, we can find the latest byte touched + * (in this operation or in an older one) in the span of the tree */ + rtree_range range; + rtree_get_range(data->tree.rtree, &range); + + if (data->filesize < range.stop) + data->filesize = range.stop; + + if (data->hints.debug) + debuginfo("trackwrite: filesize: %lu treerange: %lu-%lu\n", + (long unsigned) data->filesize, (long unsigned) range.start, + (long unsigned) range.stop); + } +} + + +/* offset is in etypes */ +int logfs_writedata(ADIO_File fd, const void *buf, + int count, MPI_Datatype memtype, ADIO_Offset ofs, int collective) +{ + ADIO_LOGFS_Data *data = logfs_data(fd); + ADIO_Offset datalogpos; + + /*ADIOI_Flatlist_node * file; */ + /*int memtypecontig; */ + int update_tree; + int track_filesize; + + /* store write in logfile; doesn't use filetype and stores offset + * relative to view displacement and etype */ + datalogpos = logfs_file_record_write(data->logfsfile, buf, count, memtype, ofs); + + /* if TRACK_FILESIZE isn't defined, only track filesize when + * reading from the file is allowed (meaning readmode level higher than + * LOGFS_READMODE_NONE); + * + * if TRACK_FILESIZE is on, always track the filesize */ +#ifdef LOGFS_TRACK_FILESIZE + track_filesize = 1; +#else + track_filesize = (data->readmode > LOGFS_READMODE_NONE ? 1 : 0); +#endif + + /* We keep an up to date tree tracking writes only for + * LOGFS_READMODE_FULL */ + update_tree = (data->readmode >= LOGFS_READMODE_FULL ? 
1 : 0); + + /* if we did an untracked write, the rtree is worthless + * Free the memory if we invalidate an existing one */ + if (!update_tree) { + if (data->rtree_valid) { + /* free memory if needed */ + rtree_clear(data->tree.rtree); + data->tree.rangesize = 0; + data->rtree_valid = 0; + } + } + + if (update_tree || track_filesize) { + logfs_trackwrite(data, memtype, count, ofs, update_tree, track_filesize, datalogpos); + } + + /* Non-zero writes invalidate the real file until the + * next replay */ + if (count) + data->file_valid = 0; + + return 1; +} + +int logfs_flush(ADIO_File fd) +{ + ADIO_LOGFS_Data *data = logfs_data(fd); + + /* flush logfiles */ + logfs_file_flush(data->logfsfile); + + /* flush underlying ringbuffers + * (also flushes underlying files) */ + writering_flush(data->writedata); + writering_flush(data->writemeta); + + return 1; +} + + + +int logfs_resize(ADIO_File fd, MPI_Offset ofs) +{ + ADIO_LOGFS_Data *data = logfs_data(fd); + + logfs_file_record_setsize(data->logfsfile, ofs); + + data->filesize = ofs; + + return 1; +} + +int logfs_set_view(ADIO_File fd, MPI_Offset disp, MPI_Datatype etype, MPI_Datatype filetype) +{ + ADIO_LOGFS_Data *data = logfs_data(fd); + + /* record the old view as native since that is the only one supported */ + logfs_file_record_view(data->logfsfile, etype, filetype, disp, "native"); + + if (data->view_etype != MPI_DATATYPE_NULL) + MPI_Type_free(&data->view_etype); + if (data->view_ftype != MPI_DATATYPE_NULL) + MPI_Type_free(&data->view_ftype); + + MPI_Type_dup(etype, &data->view_etype); + MPI_Type_dup(filetype, &data->view_ftype); + data->view_disp = disp; + + /* cache some data here; We need it anyway + * (unless we're in write-only mode) */ + MPI_Type_extent(data->view_ftype, &data->view_ftype_extent); + MPI_Type_size(data->view_ftype, &data->view_ftype_size); + MPI_Type_size(data->view_etype, &data->view_etype_size); + + + return 1; +} + + +ADIO_Offset logfs_getfsize(ADIO_File fd) +{ + ADIO_LOGFS_Data *data = 
logfs_data(fd); + +#ifndef LOGFS_TRACK_FILESIZE + /* if track filesize is not set, it is illegal to call getfsize in wr-only + * mode */ + assert(data->readmode > LOGFS_FILE_READ_NONE); +#endif + return data->filesize; +} + + +/*=========================================================================== +//===== replay ============================================================== +//==========================================================================*/ + +typedef struct { + logfs_file_typeinfo *ftype; + logfs_file_typeinfo *etype; + int ftype_size; + int ftype_extent; + int etype_extent; + int etype_size; + int ftype_cont; + ADIO_Offset disp; + ADIO_Offset size; + int epoch; + logfs_rtree *tree; + ADIO_LOGFS_Data *data; +} logfs_replay_data; + + +static inline void logfs_replay_freetype(logfs_file_typeinfo * info) +{ + if (!info) + return; + ADIOI_Free(info->blocklens); + ADIOI_Free(info->indices); + ADIOI_Free(info); +} + + +static int logfs_replay_init(void *data) +{ + logfs_replay_data *rep = (logfs_replay_data *) data; + + if (rep->data->hints.debug) { + debuginfo("logfs_replay_init ...\n"); + } + + rep->ftype_size = 1; + rep->ftype_extent = 1; + rep->etype_size = 1; + rep->etype_extent = 1; + rep->ftype = 0; + rep->etype = 0; + rep->ftype_cont = 1; + rep->epoch = -1; + rep->disp = 0; + return 1; +} + +static int logfs_replay_start_epoch(void *data, int epoch) +{ + logfs_replay_data *rep = (logfs_replay_data *) data; + + + if (rep->data->hints.debug) { + debuginfo("Start epoch: epoch num=%lu\n", (unsigned long) epoch); + } + + rep->epoch = epoch; + return 1; +} + +static int logfs_replay_set_view(void *data, ADIO_Offset disp, + logfs_file_typeinfo * etype, logfs_file_typeinfo * ftype, + const char *datarep) +{ + logfs_replay_data *rep = (logfs_replay_data *) data; + + if (rep->data->hints.debug) { + debuginfo("replay write: set view: disp=%lu datarep=%s\n", (unsigned long) disp, datarep); + } + + + logfs_replay_freetype(rep->etype); + 
logfs_replay_freetype(rep->ftype); + + rep->etype = etype; + rep->ftype = ftype; + rep->disp = disp; + + rep->etype_extent = logfs_file_typeinfo_extent(etype); + rep->ftype_extent = logfs_file_typeinfo_extent(ftype); + rep->ftype_size = logfs_file_typeinfo_size(ftype); + rep->etype_size = logfs_file_typeinfo_size(etype); + rep->ftype_cont = logfs_file_typeinfo_continuous(ftype); + + return 1; +} + +static int logfs_replay_set_size(void *data, ADIO_Offset size) +{ + logfs_replay_data *rep = (logfs_replay_data *) data; + rep->size = size; + return 1; +} + + +/* TODO: check this for logfs_rtree (rangesize?) */ +/* size is in bytes, offset is location in datalog */ +static int logfs_replay_write(void *data, ADIO_Offset writeofs, int size, ADIO_Offset datalogstart) +{ + logfs_replay_data *rep = (logfs_replay_data *) data; + int ftypecount; + int ftyperemainder; + ADIO_Offset fileofs; + int i; + int j; + ADIO_Offset ofs = writeofs; + + assert(rep->epoch >= 0); + + if (rep->data->hints.debug) { + debuginfo("replay write: writeofs=%lu, size=%lu, \n"); + } + + /* we have view data (flatbuf,extent,size) in rep */ + + /* calc byte start offset */ + ofs *= rep->etype_size; + ofs += rep->disp; + + if (rep->ftype_cont) { + /* continuous case */ + ADIO_Offset start = ofs; + ADIO_Offset stop = ofs + size; + logfs_rtree_addsplit(rep->tree, start, stop, datalogstart); + return 1; + } + + + /* calculate amount in replay */ + ftypecount = size / rep->ftype_size; + ftyperemainder = size % rep->ftype_size; + + + /* we have a flatbuf rep */ + /* MPI-IO file views can be tiled */ + for (i=0; iftype->count; ++j) + { + /* fileofs: offset in canonical file. Computation was wrong for + * cases where datatype lower bound was non-zero: + * - ofs: user-provided offset of this request. 
we got that value + * directly from the .meta log file + * - i*rep->ftype_extent: the idiomantic way to deal with tiled file + * views + * - rep->ftype->indices[j] - rep->ftype->indices[0]: When + * indices[0] is zero (when lower bound is zero), this does nothing. + * When lower bound is non-zero this adjusts the offsets relative to + * the lower bound. However the offsets do not need adjusting! */ + + fileofs = ofs + i*rep->ftype_extent + + rep->ftype->indices[j]; + + logfs_rtree_addsplit(rep->tree, + fileofs, fileofs + rep->ftype->blocklens[j], datalogstart); + + /* datalogstart: posistion in .data file */ + datalogstart += rep->ftype->blocklens[j]; + } + } + + ofs += ftypecount * rep->ftype_extent; + + while (ftyperemainder) { + ADIO_Offset increment; + increment = (ftyperemainder > rep->ftype->blocklens[j] ? + ftyperemainder : rep->ftype->blocklens[j]); + + logfs_rtree_addsplit(rep->tree, ofs, ofs + increment, datalogstart); + + ftyperemainder -= increment; + datalogstart += increment; + } + + return 1; +} + +static int logfs_replay_done(void *data) +{ + logfs_replay_data *rep = (logfs_replay_data *) data; + + /* free etype/ftype */ + logfs_replay_freetype(rep->etype); + logfs_replay_freetype(rep->ftype); + + if (rep->data->hints.debug) { + debuginfo("replay done\n"); + /* dump tree */ + rtree_dump(rep->tree->rtree); + } + return 1; +} + + +/* replay current epoch and build rtree rep (adding to possibly existing one) */ +static void logfs_replay_buildrtree(ADIO_LOGFS_Data * data, int all) +{ + logfs_file_replayops ops; + logfs_replay_data repdata; + + ops.init = logfs_replay_init; + ops.start_epoch = logfs_replay_start_epoch; + ops.set_view = logfs_replay_set_view; + ops.set_size = logfs_replay_set_size; + ops.write = logfs_replay_write; + ops.done = logfs_replay_done; + + /* clear out old rtree if any */ + if (!(data->tree.rtree) ) return; + rtree_clear(data->tree.rtree); + data->tree.rangesize = 0; + + + repdata.tree = &data->tree; + repdata.data = data; + + 
logfs_file_replay(data->logfsfile, !all, &ops, &repdata); +} + + + +int logfs_sync(ADIO_File fd) +{ + ADIO_LOGFS_Data *data = logfs_data(fd); + ADIO_Offset filesize; + + if (data->hints.debug) + debuginfo("------------------- [LogFS] Sync called ---------------\n"); + + logfs_file_record_sync(data->logfsfile); + + /* when LOGFS_TRACK_FILESIZE is defined, always keep an accurate + * filesize after a sync, even when in write-only mode */ +#ifndef LOGFS_TRACK_FILESIZE + if (data->readmode > LOGFS_READ_NONE) { +#endif + MPI_Allreduce(&data->filesize, &filesize, 1, ADIO_OFFSET, MPI_MAX, data->comm); + data->filesize = filesize; +#ifndef LOGFS_TRACK_FILESIZE + } +#endif + + /* since we did a sync (causing a replay in r/w mode) our tree + * can be emptied and is valid */ + rtree_clear(data->tree.rtree); + data->tree.rangesize = 0; + data->rtree_valid = 1; + + /* sync the real file */ + if (MPI_FILE_NULL != data->realfile_single) + MPI_File_sync(data->realfile_single); + if (MPI_FILE_NULL != data->realfile_collective) + MPI_File_sync(data->realfile_collective); + return 1; +} + + +//=========================================================================== +//===== Read support ======================================================== +//=========================================================================== + +/* + * TODO: + * possible optimizations: + * 1) if the full rtree is available and up to date, use that one + * to read directly from the logfile what is available and + * read the rest from the real file + * 2) If the full rtree is not available, do a replay of the epoch + * and record only regions of interest to our current read request + * (by first converting the datatype into an rtree (put + * ADIOI_INVALID_OFFSET everywhere) and then replaying and updating + * (modified rtree_add_split) ; then read again from real file and + * logfile + */ + +/* For now, we just replay the last epoch (collective or not) + * and then read from the real file + */ +/* note: 
offset is in bytes, not etypes */ +int logfs_readdata(ADIO_File fd, void *buf, + int count, MPI_Datatype memtype, ADIO_Offset offset, int collective, + MPI_Status * status) +{ + ADIO_LOGFS_Data *data = logfs_data(fd); + MPI_File *file; + + + /* what does it mean to replay a log file when the file is opened read-only? + * We will have to assume the user knows what he/she asked for (hah!), and + * that something else replayed the file already (as in replay-on-close). */ + if (!data->file_valid && !(data->user_amode & MPI_MODE_RDONLY)) + { + /* If the file is not valid, replay once; + * Keep the file marked as valid until the first write operation */ + + /* Full replay */ + logfs_replay_helper(data, collective); + data->file_valid = 1; + } + + /* now we can just read from the real file */ + + /* set view */ + if (collective) + file = &data->realfile_collective; + else + file = &data->realfile_single; + + MPI_File_set_view (*file, data->view_disp, MPI_BYTE, + data->view_ftype, "native", MPI_INFO_NULL); + + if (collective) { + return MPI_File_read_at_all(*file, offset, buf, count, memtype, status); + } else { + return MPI_File_read_at(*file, offset, buf, count, memtype, status); + } +} + +/* +int logfs_readdata (ADIO_File fd, void * buf, int count, + MPI_Datatype memtype, ADIO_Offset offset, int collective) +{ + if we have a valid rtree use it find out where to read data + * if not: + * 1) build the full tree and keep it around so that + * we can also handle a next read (but junk it again on the first + * write) + * 2) build a partial tree only describing data dealing with the read + * request; this tree cannot be reused + */ + + /* use the tree to lookup every part of the mem read */ + /* generate a type for reading in the datafile, + * also generate a type for reading in the real file*/ + + /* start 2 nonblocking reads (since they could be on different filesystems + * (e.g. 
local disk and netwerk filesys) we want to maximize bandwidth usage) */ + + /* problem: in independent mode, cannot change the view on the real file! + * + + assert (0); + } + */ + + +/* + * Replay the log, passing all data to the user; + * Let him/her solve the consistency issue. + */ +static int logfs_user_replay(ADIO_LOGFS_Data * data) +{ + assert(data->user_replay_cb.init); + assert(data->user_replay_cb.done); + return 0; +} diff --git a/src/mpi/romio/adio/ad_logfs/logfs.h b/src/mpi/romio/adio/ad_logfs/logfs.h new file mode 100644 index 00000000000..f36a9476cdc --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/logfs.h @@ -0,0 +1,108 @@ +/**************************************************** + * High level logfs support functions * + ****************************************************/ + +#ifndef ROMIO_LOGFS_H +#define ROMIO_LOGFS_H + +#include "adio.h" +#include "layered.h" + +/* do we need to track file size even in wronly mode? */ + +/* Currently, the filesize is only consistent AFTER a sync point is reached + * (open, close, sync) + * + * see: MPI standard: + * "When applying consistency semantics, calls to MPI_FILE_SET_SIZE and + * MPI_FILE_PREALLOCATE are considered writes to the file (which conflict + * with operations that access bytes at displacements between the old and + * new file sizes), and MPI_FILE_GET_SIZE is considered a read of the file + * (which overlaps with all accesses to the file)." 
+ */ +#define LOGFS_TRACK_FILESIZE + +/* Supported read levels + * + * NONE : no reading allowed + * SOME : lots of writes, mixed with a rare read + * PHASED: reading and writing phases distinct; + * In this mode, the tree is constructed at the first read + * and is discarded at the next write + * FULL : Full tracking; For mixed read and write workloads + */ +typedef enum { + LOGFS_READMODE_NONE = 0, + LOGFS_READMODE_SOME, + LOGFS_READMODE_PHASED, + LOGFS_READMODE_FULL +} logfs_readmode_kind; + +struct ADIO_LOGFS_Data; + +/* return true if we are in logfs: prefix mode */ +static inline int logfs_standalone(ADIO_File fd) +{ + return (fd->file_system == ADIO_LOGFS); +} + + +static inline struct ADIO_LOGFS_Data *logfs_data(ADIO_File fd) +{ + if (logfs_standalone(fd)) + return (struct ADIO_LOGFS_Data *) fd->fs_ptr; + return (struct ADIO_LOGFS_Data *) ADIOI_Layer_get_data(fd); +} + +/* check if logfs is active on the filehandle */ +int logfs_active(ADIO_File fd); + +/* force replay (sync) of fd */ +int logfs_replay(ADIO_File fd, int collective); + +/* collective; logfs init */ +int logfs_activate(ADIO_File fd, MPI_Info info); + +/* collective; logfs deactivate */ +int logfs_deactivate(ADIO_File fd); + +/* deletes log files associated with the given file */ +int logfs_delete (const char * filename); + + +/* return true if the given filename has a logfs log attached */ +int logfs_probe(MPI_Comm comm, const char *filename); + +/* store update: + * uses current view and displacement + * Only requires the offset to be in etypes relative to the displacement + */ +int logfs_writedata(ADIO_File fd, const void *buf, + int count, MPI_Datatype memtype, ADIO_Offset offset, int collective); + +int logfs_readdata(ADIO_File fd, void *buf, + int count, MPI_Datatype memtype, ADIO_Offset offset, + int collective, MPI_Status * status); + +/* flush logfiles and real file */ +int logfs_flush(ADIO_File fd); + +/* resize */ +int logfs_resize(ADIO_File fd, MPI_Offset newsize); + +/* set view 
*/ +int logfs_set_view(ADIO_File fd, MPI_Offset disp, MPI_Datatype etype, MPI_Datatype ftype); + +/* file sync op called */ +int logfs_sync(ADIO_File fd); + +/* Adjust hints */ +void logfs_setinfo(ADIO_File fd, MPI_Info info); + +/* copy hints */ +void logfs_transfer_hints(MPI_Info source, MPI_Info dest); + +/* return current logical file size */ +ADIO_Offset logfs_getfsize(ADIO_File fd); + +#endif diff --git a/src/mpi/romio/adio/ad_logfs/logfs_file.c b/src/mpi/romio/adio/ad_logfs/logfs_file.c new file mode 100644 index 00000000000..ba8213bbdb4 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/logfs_file.c @@ -0,0 +1,738 @@ +#include +#include "adio.h" +#include "adio_extern.h" +#include "logfs_file.h" +#include "typehelper.h" + +/* ======================================================================= */ +/* ============= LOGFS_FILE RECORDTYPE =================================== */ +/* ======================================================================= */ + +/* + * The displacement and datatype follow the record header + */ +#define LOGFS_FILE_RECORD_VIEW 1 + +/* + * After this header kind: + * - seek position (in etypes) + * - datasize (in bytes) + * + * The datasize (in bytes) followed by the + * actual data + */ +#define LOGFS_FILE_RECORD_DATA 2 + +/* + * Followed by the epoch number + */ +#define LOGFS_FILE_RECORD_SYNC 3 + +/* + * followed by filesize (MPI_Offset) + */ +#define LOGFS_FILE_RECORD_SETSIZE 4 + +/* used for debugging; If enabled, magicstart and magicstop will + * be written before and after the recordheader */ +/* #define LOGFS_FILE_RECORDMAGIC */ + +#define LOGFS_FILE_RECORDMAGIC_START "[magicstart] " +#define LOGFS_FILE_RECORDMAGIC_STOP "[magicstop ] " + + +/* Record struct for in the logfs metalog */ +typedef struct { +#ifdef LOGFS_FILE_RECORDMAGIC + char magic_start[16]; +#endif + int recordtype; + double timestamp; +#ifdef LOGFS_FILE_RECORDMAGIC + char magic_stop[16]; +#endif +} logfs_file_recordstruct; + + +/* Header that goes at the 
beginning of both metadata and datafiles */ +typedef struct { + char magic[64]; + /* don't store epoch here; otherwise we need an alreduce when we lazily + * open the logfile to inform other CPUs of the highest epoch number */ + /*int epoch; *//* epoch number of last epoch in file */ +} logfs_file_headerstruct; + + + +/* + * Handles all logfs meta- and datafile I/O + */ +struct logfs_file_instance { + MPI_Comm comm; /* file communicator */ + + /* external functions for writing to a logfile */ + logfs_file_ops ops; + void *ops_data; + + /* external functions for reading from a logfile */ + logfs_file_readops readops; + void *readops_data; + + ADIO_Offset datalog_offset; /* offset of next write in datalog */ + ADIO_Offset metalog_offset; /* offset of next write in metalog */ + ADIO_Offset dataepoch_start; /* start offset of epoch in datalog */ + ADIO_Offset metaepoch_start; /* start offset of epoch in metalog */ + int last_epoch; + + int active; /* has there been real data yet */ + int readactive; /* if the file is open for reading */ + + /* do we need to record updates to view/size/epoch before the next data + * write? */ + int dirty_view; + int dirty_size; + int dirty_sync; + + /* file state info */ + int epoch; + ADIO_Offset filesize; + ADIO_Offset displacement; + MPI_Datatype etype; + MPI_Datatype filetype; + + /* used for converting memdata into a cont. 
buffer + * When switching to dataloop: think of doing partial writes (starting + * write as soon as 1 block of data is available) & writing + * directly into writering mem*/ + DatatypeHandler memdecodeops; + + /* used for replay: current read position in logfile */ + ADIO_Offset readpos; +}; + +/***************************************************************************/ +/*** forwards ************************************************************** + ***************************************************************************/ + +static int logfs_file_acceptmemdata(void *membuf, int size, ADIO_Offset fileoffset, void *data); + +static void logfs_file_writeheader(logfs_file_handle handle); + +static inline int logfs_file_read(logfs_file_handle handle, void *data, int size); + +static void logfs_file_flush_size(logfs_file_handle handle); + +static inline void logfs_file_readdatatype(logfs_file_handle handle, logfs_file_typeinfo * info); + +static inline void logfs_file_readseek(logfs_file_handle handle, ADIO_Offset pos); + +static int logfs_file_readheader(logfs_file_handle handle, logfs_file_headerstruct * h); + +/***************************************************************************/ + +logfs_file_handle logfs_file_create(MPI_Comm comm, const logfs_file_ops * ops, void *ops_data, + const logfs_file_readops * readops, void *readops_data) +{ + logfs_file_handle handle = (logfs_file_handle) + malloc(sizeof(struct logfs_file_instance)); + assert(handle); + + MPI_Comm_dup(comm, &handle->comm); + + handle->ops = *ops; + handle->ops_data = ops_data; + handle->readops = *readops; + handle->readops_data = readops_data; + + handle->active = 0; + handle->epoch = 0; + handle->datalog_offset = 0; + handle->metalog_offset = 0; + + handle->dirty_view = 0; + handle->dirty_size = 0; + handle->dirty_sync = 0; + + /* we rely on a set view after the create */ + handle->displacement = 0; + handle->filesize = 0; + handle->etype = MPI_DATATYPE_NULL; + handle->filetype = 
MPI_DATATYPE_NULL; + + /* callback for streaming memory datatype contents */ + handle->memdecodeops.start = 0; + handle->memdecodeops.startfragment = 0; + handle->memdecodeops.stop = 0; + handle->memdecodeops.stopfragment = 0; + handle->memdecodeops.processdata = logfs_file_acceptmemdata; + + /* reading */ + handle->readactive = 0; + handle->readpos = 0; + + return handle; +} + + +void logfs_file_free(logfs_file_handle * handle) +{ + /* flush setsize if needed */ + if ((*handle)->dirty_size) + logfs_file_flush_size(*handle); + + if ((*handle)->readactive) { + (*handle)->readops.done((*handle)->readops_data); + } + + if ((*handle)->active) { + (*handle)->ops.done((*handle)->ops_data); + } + + if ((*handle)->etype != MPI_DATATYPE_NULL) + MPI_Type_free(&(*handle)->etype); + + if ((*handle)->filetype != MPI_DATATYPE_NULL) + MPI_Type_free(&(*handle)->filetype); + + MPI_Comm_free(&(*handle)->comm); + + free(*handle); + *handle = 0; + +} + +static inline void logfs_file_openlogs(logfs_file_handle handle) +{ + /*int headerok; */ + logfs_file_headerstruct h; + + if (handle->active) + return; + + handle->active = 1; + + handle->ops.init(handle->ops_data); + + /* try to see if there is already a header */ + + /* try to read header */ + logfs_file_readseek(handle, 0); + if (logfs_file_readheader(handle, &h)) { + /* read header was ok; move to end of logfile and metafile */ + handle->ops.getsize(handle->ops_data, &handle->datalog_offset, LOGFS_FILE_LOG_DATA); + handle->ops.getsize(handle->ops_data, &handle->metalog_offset, LOGFS_FILE_LOG_META); + } + else { + /* couldn't read header: trunc file */ + /* reset filesize and write new header */ + handle->ops.restart(handle->ops_data, 0, LOGFS_FILE_LOG_META); + handle->metalog_offset = 0; + handle->ops.restart(handle->ops_data, 0, LOGFS_FILE_LOG_DATA); + handle->datalog_offset = 0; + logfs_file_writeheader(handle); + } +} + +static inline void logfs_file_write(logfs_file_handle handle, const void *data, int size, int log) +{ + if 
(!handle->active) + logfs_file_openlogs(handle); + + handle->ops.write(handle->ops_data, + (log == + LOGFS_FILE_LOG_DATA ? handle->datalog_offset : handle->metalog_offset), data, + size, log); + + /* keep track of offset in data log file */ + if (log == LOGFS_FILE_LOG_DATA) + handle->datalog_offset += size; + else + handle->metalog_offset += size; +} + +/* read flatlist view of datatype */ +static inline void logfs_file_readdatatype(logfs_file_handle handle, logfs_file_typeinfo * info) +{ + logfs_file_read(handle, &info->count, sizeof(info->count)); + + info->indices = ADIOI_Malloc (sizeof(*(info->indices))*info->count); + info->blocklens = ADIOI_Malloc (sizeof(*(info->blocklens))*info->count); + + logfs_file_read (handle, info->indices, sizeof(*(info->indices))*info->count) ; + logfs_file_read (handle, info->blocklens, sizeof(*(info->blocklens))*info->count); +} + +static void logfs_file_writedatatype(logfs_file_handle handle, MPI_Datatype type) +{ + ADIOI_Flatlist_node *flat_buf; + + + /* flattened code will store flattened representation as an attribute on + * both built-in contiguous types and user-derrived types */ + flat_buf = ADIOI_Flatten_and_find(type); + + assert(flat_buf); + + + /* write flattened version for now */ + /* write count, indices, blocklens */ + logfs_file_write(handle, &flat_buf->count, sizeof(flat_buf->count), LOGFS_FILE_LOG_META); + logfs_file_write(handle, &flat_buf->indices[0], + sizeof(flat_buf->indices[0]) * flat_buf->count, LOGFS_FILE_LOG_META); + logfs_file_write(handle, &flat_buf->blocklens[0], + sizeof(flat_buf->blocklens[0]) * flat_buf->count, LOGFS_FILE_LOG_META); +} + +static int logfs_file_readheader(logfs_file_handle handle, logfs_file_headerstruct * h) +{ + return (logfs_file_read(handle, h, sizeof(logfs_file_headerstruct)) + == sizeof(logfs_file_headerstruct)); +} + +static void logfs_file_writeheader(logfs_file_handle handle) +{ + logfs_file_headerstruct h; + memset(h.magic, 0, sizeof(h.magic)); + strcpy(h.magic, 
"logfs\n"); + + /* write a header to both files */ + logfs_file_write(handle, &h, sizeof(h), LOGFS_FILE_LOG_META); + logfs_file_write(handle, &h, sizeof(h), LOGFS_FILE_LOG_DATA); +} + +static inline void logfs_file_writerecordheader(logfs_file_handle handle, int recordtype) +{ + logfs_file_recordstruct s; + s.recordtype = recordtype; + s.timestamp = MPI_Wtime(); +#ifdef LOGFS_FILE_RECORDMAGIC + strncpy(s.magic_start, LOGFS_FILE_RECORDMAGIC_START, sizeof(s.magic_start)); + strncpy(s.magic_stop, LOGFS_FILE_RECORDMAGIC_STOP, sizeof(s.magic_stop)); +#endif + logfs_file_write(handle, &s, sizeof(s), LOGFS_FILE_LOG_META); +} + +static void logfs_file_flush_size(logfs_file_handle handle) +{ + logfs_file_writerecordheader(handle, LOGFS_FILE_RECORD_SETSIZE); + logfs_file_write(handle, &handle->filesize, sizeof(handle->filesize), LOGFS_FILE_LOG_META); + handle->dirty_size = 0; +} + + +static void logfs_file_flush_sync(logfs_file_handle handle) +{ + /* record start of new epoch in metalog & datalog */ + handle->dataepoch_start = handle->datalog_offset; + handle->metaepoch_start = handle->metalog_offset; + handle->last_epoch = handle->epoch; + + logfs_file_writerecordheader(handle, LOGFS_FILE_RECORD_SYNC); + logfs_file_write(handle, &handle->epoch, sizeof(handle->epoch), LOGFS_FILE_LOG_META); + handle->dirty_sync = 0; +} + +static void logfs_file_flush_view(logfs_file_handle handle) +{ + logfs_file_writerecordheader(handle, LOGFS_FILE_RECORD_VIEW); + logfs_file_write(handle, &handle->displacement, + sizeof(handle->displacement), LOGFS_FILE_LOG_META); + logfs_file_writedatatype(handle, handle->etype); + logfs_file_writedatatype(handle, handle->filetype); + handle->dirty_view = 0; +} + + + +void logfs_file_record_sync(logfs_file_handle handle) +{ + /*ADIO_Offset filesize; */ + + /* see if any of the others wrote a view change; if so, we need to do to + * (because of collective nature of the calls); same for set_size + * + * So, allgather the dirty bits for epoch, view and 
setsize; + * 1) they are all 0: -> no size/view changes since last sync -> do + * nothing + * 2) All 1: view/setsize called, but nobody wrote a record -> do nothing + * 3) Some are 1: there was a change, and some CPUs wrote a record + * -> Everybody who still has a 1 needs to flush the record + */ + /* lazy writing of the epoch */ + + /*int flags[3] = {handle->dirty_view ? 1 : 0, + * handle->dirty_size ? 1 : 0, + * handle->dirty_sync ? 1 : 0}; + * int recvbuf[3] = {0, 0, 0}; */ + + /* since the dirty bits will be collective set to 1 (because of the fact + * that the calls to set_size, sync, set_view are collective) finding a min + * of 0 when we have a 1 means somebody did write a record so we have to do + * so too */ + /*MPI_Allreduce (&flags[0], &recvbuf[0], 3, MPI_INT, MPI_MIN, handle->comm); + */ + /* so we have a zero and min is zero: everybody has zero; nothing happened + * since last sync, so no need to flush */ + /* if we have more than the min, we need to flush (means we have dirty bit + * and at least one of the others doens't) */ + + /* problem here: need to have the order right otherwise there could be a + * deadlock in the replay + * Also: dirty bit needs to be extended into count because of the + * following: + * set_size set_size + * write data (causes set_size flush) do nothing + * set size set size + * + * sync sync + * + * The sync would only cause one set size record to be written + * solution: probably need same detection logic in any of the collective + * calls (size,view,sync) which would solve the above situation by + * causing a flush on cpu B before the second set size + * + * Problem: makes these ops collective again... 
consider not doing + * delayed set_size/set_view/sync at all which would avoid the allreduce + * on set_view,sync and setsize */ + + /*if (flags[0] < recvbuf[0]) + * logfs_file_flush_view (handle); + * if (flags[1] < recvbuf[1]) + * logfs_file_flush_size(handle); + * if (flags[2] < recvbuf[2]) + * logfs_file_flush_sync (handle); + */ + + ++handle->epoch; + handle->dirty_sync = 1; + + +} + + +void logfs_file_record_setsize(logfs_file_handle handle, MPI_Offset size) +{ + handle->filesize = size; + if (!size) { + /* TODO do something special and erase the log */ + /* or even better: handle this on a higher level; + * this has nothing to do with the logfs_file as such*/ + handle->epoch = 0; + } + + handle->dirty_size = 1; + +} + +void logfs_file_record_view(logfs_file_handle handle, MPI_Datatype etype, + MPI_Datatype filetype, MPI_Offset displacement, const char *datarep) +{ + if (handle->etype != MPI_DATATYPE_NULL) + MPI_Type_free(&handle->etype); + if (handle->filetype != MPI_DATATYPE_NULL) + MPI_Type_free(&handle->filetype); + + MPI_Type_dup(filetype, &handle->filetype); + MPI_Type_dup(etype, &handle->etype); + + handle->displacement = displacement; + + /* lazy writing of file view */ + handle->dirty_view = 1; + +} + + +MPI_Offset logfs_file_record_write(logfs_file_handle handle, + const void * buf, int count, + MPI_Datatype memtype, MPI_Offset offset) +{ + int size; + ADIO_Offset dataoffset; + + /* this always forces a state flush */ + if (handle->dirty_view) + logfs_file_flush_view(handle); + if (handle->dirty_size) + logfs_file_flush_size(handle); + if (handle->dirty_sync) + logfs_file_flush_sync(handle); + + logfs_file_writerecordheader(handle, LOGFS_FILE_RECORD_DATA); + + /* record number of bytes we are going to write */ + MPI_Type_size(memtype, &size); + size *= count; + logfs_file_write(handle, &size, sizeof(size), LOGFS_FILE_LOG_META); + + /* store the write offset in the file */ + logfs_file_write(handle, &offset, sizeof(offset), LOGFS_FILE_LOG_META); + + 
/* store the offset where our data will end up in the data log file */ + dataoffset = handle->datalog_offset; + logfs_file_write(handle, &handle->datalog_offset, + sizeof(handle->datalog_offset), LOGFS_FILE_LOG_META); + + /* setup dataloop copy */ + typehelper_decodememtype(buf, count, memtype, &handle->memdecodeops, handle); + + /* check that the datafile is as we think it is */ + assert((dataoffset + size) == handle->datalog_offset); + + return dataoffset; +} + +/* function gets called from within typehelper to copy the memory datatype + * contents; Just accepts data and copies it into the logfile; + * the type was already stored anyway*/ +int logfs_file_acceptmemdata(void *membuf, int size, ADIO_Offset fileoffset, void *data) +{ + logfs_file_handle handle = (logfs_file_handle) data; + + logfs_file_write(handle, membuf, size, LOGFS_FILE_LOG_DATA); + + return 1; +} + +void logfs_file_flush(logfs_file_handle handle) +{ + /* nothing to do; we cannot flush the logfiles, that is left up to whoever + * provided us with the callbacks */ +} + +void logfs_file_clear(logfs_file_handle handle, int last) +{ + /* if nothing is written yet, don't start now */ + if (!handle->active) + return; + + if (last) { + /* if the last recorded epoch change is not equal to the current epoch, + * there hasn't been a meaningful write operation in the current epoch + * (otherwise the new epoch would have been recorded in the log) + * As such, we don't have to do anything to clear the current epoch. */ + if (handle->epoch != handle->last_epoch) + return; + + /* reset to last epoch */ + handle->ops.restart(handle->ops_data, handle->metaepoch_start, LOGFS_FILE_LOG_META); + handle->metalog_offset = handle->metaepoch_start; + + handle->ops.restart(handle->ops_data, handle->dataepoch_start, LOGFS_FILE_LOG_DATA); + handle->datalog_offset = handle->dataepoch_start; + + /* What else to do here? mark state as dirty? 
*/ + return; + } + + /* whole file needs to be cleared + * -> we can reset the epoch counter; rewrite the header + */ + handle->epoch = 0; + + /* force rewrite of file header on next data write */ + /* we rewrite the header ourselves */ + /*handle->active = 0; */ + + /* force rewrite of state on next data write */ + handle->dirty_view = 1; + handle->dirty_size = 1; + handle->dirty_sync = 1; + + handle->ops.restart(handle->ops_data, 0, LOGFS_FILE_LOG_META); + handle->metalog_offset = 0; + + handle->ops.restart(handle->ops_data, 0, LOGFS_FILE_LOG_DATA); + handle->datalog_offset = 0; + + /* write new header */ + logfs_file_writeheader(handle); +} + +/* ======================================================================== */ +/* === Read functions (for internal use) ================================== */ +/* ======================================================================== */ +static inline void logfs_file_readseek(logfs_file_handle handle, ADIO_Offset pos) +{ + assert(pos <= handle->metalog_offset); + handle->readpos = pos; +} + +/* do read operation at current position, increase read pos */ +static inline int logfs_file_read(logfs_file_handle handle, void *data, int size) +{ + int read; + if (!handle->readactive) { + handle->readactive = 1; + handle->readops.init(handle->readops_data); + } + + read = handle->readops.read(handle->readops_data, handle->readpos, + data, size, LOGFS_FILE_LOG_META); + handle->readpos += read; + return read; +} + +/* try to read a record; return 1 if OK, 0 if EOF */ +static inline int logfs_file_readrecord(logfs_file_handle handle, logfs_file_recordstruct * header) +{ + int read = logfs_file_read(handle, header, sizeof(logfs_file_recordstruct)); + + + +#ifdef LOGFS_FILE_RECORDMAGIC + /* If we could read something, it should be a valid header */ + if (read == sizeof(logfs_file_recordstruct)) { + assert(!strncmp(header->magic_start, LOGFS_FILE_RECORDMAGIC_START, + sizeof(header->magic_start))); + assert(!strncmp(header->magic_stop, 
LOGFS_FILE_RECORDMAGIC_STOP, + sizeof(header->magic_stop))); + + } +#endif + + /* either EOF or we read a full record; other case means error! */ + assert(!read || sizeof(logfs_file_recordstruct) == read); + + return read; +} + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ +static inline + int logfs_file_replay_processview(logfs_file_handle handle, + const logfs_file_replayops * ops, void *opsdata) +{ + ADIO_Offset displacement; + + logfs_file_typeinfo *etype = (logfs_file_typeinfo *) + ADIOI_Malloc(sizeof(logfs_file_typeinfo)); + logfs_file_typeinfo *ftype = (logfs_file_typeinfo *) + ADIOI_Malloc(sizeof(logfs_file_typeinfo)); + + logfs_file_read(handle, &displacement, sizeof(displacement)); + logfs_file_readdatatype(handle, etype); + logfs_file_readdatatype(handle, ftype); + + return ops->set_view(opsdata, displacement, etype, ftype, "native"); +} + +static inline + int logfs_file_replay_processdata(logfs_file_handle handle, + const logfs_file_replayops * ops, void *opsdata) +{ + int size; + ADIO_Offset datalogofs, fileofs; + logfs_file_read(handle, &size, sizeof(size)); + logfs_file_read(handle, &fileofs, sizeof(fileofs)); + logfs_file_read(handle, &datalogofs, sizeof(datalogofs)); + + return ops->write(opsdata, fileofs, size, datalogofs); +} + +static inline + int logfs_file_replay_processsync(logfs_file_handle handle, + const logfs_file_replayops * ops, void *opsdata) +{ + int epoch; + + logfs_file_read(handle, &epoch, sizeof(epoch)); + return ops->start_epoch(opsdata, epoch); +} + +static inline + int logfs_file_replay_processsize(logfs_file_handle handle, + const logfs_file_replayops * ops, void *opsdata) +{ + ADIO_Offset ofs; + logfs_file_read(handle, &ofs, sizeof(ofs)); + return ops->set_size(opsdata, ofs); +} + + +int logfs_file_replay(logfs_file_handle handle, 
int last, + const logfs_file_replayops * ops, void *opsdata) +{ + /* offset in metalog where we start replaying */ + int active = 0; /* did we find something meaningful? */ + int cont = 1; + logfs_file_recordstruct record; + + /* if nothing is written yet, don't start now */ + /* unless we reopened a file! NOTE(review): 'active' is still 0 at this point, so this returns 1 whenever 'last' is set and the last-epoch branch below looks unreachable -- confirm the intended condition */ + if (!active && last) + return 1; + + if (last) { + /* if the last recorded epoch change is not equal to the current epoch, + * there hasn't been a meaningful write operation in the current epoch + * (otherwise the new epoch would have been recorded in the log) + * As such, we don't have to do anything to clear the current epoch. */ + if (handle->epoch != handle->last_epoch) + return 1; + + logfs_file_readseek(handle, handle->metaepoch_start); + } + else { + logfs_file_headerstruct h; + logfs_file_readseek(handle, 0); + /* read header; increments replaypos */ + logfs_file_readheader(handle, &h); + } + + /* loop over records + * -> read record; find out type; read rest; call callback */ + + while (cont) { + /* read record; if EOF break */ + int ok = logfs_file_readrecord(handle, &record); + + if (!ok) + break; + + /* we found something, so make sure the callbacks are ready */ + if (!active) { + active = 1; + ops->init(opsdata); + } + + /* process record; read additional data; each handler's return value decides whether the loop continues */ + switch (record.recordtype) { + case LOGFS_FILE_RECORD_VIEW: + cont = logfs_file_replay_processview(handle, ops, opsdata); + break; + case LOGFS_FILE_RECORD_DATA: + cont = logfs_file_replay_processdata(handle, ops, opsdata); + break; + case LOGFS_FILE_RECORD_SYNC: + cont = logfs_file_replay_processsync(handle, ops, opsdata); + break; + case LOGFS_FILE_RECORD_SETSIZE: + cont = logfs_file_replay_processsize(handle, ops, opsdata); + break; + default: + assert(0 /* UNKNOWN RECORD TYPE IN LOG */); + fprintf(stderr, "skipping unknown record type in log\n"); + }; + + if (!cont) + break; + } + + if (active && ops->done) + ops->done(opsdata); + + return cont; +} + +void 
logfs_file_setepoch(logfs_file_handle handle, int epoch) +{ + int ep = epoch; + MPI_Bcast(&ep, 1, MPI_INT, 0, handle->comm); + assert(ep == epoch); + + handle->epoch = epoch; + handle->dirty_sync = 1; +} diff --git a/src/mpi/romio/adio/ad_logfs/logfs_file.h b/src/mpi/romio/adio/ad_logfs/logfs_file.h new file mode 100644 index 00000000000..82392261b49 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/logfs_file.h @@ -0,0 +1,170 @@ +/**************************************************************************** + * Allows reading/writing of a logfs trace log * + * + * Only deals with the logfs logfiles; + * => recording and rereading journal entries * + * * + ****************************************************************************/ + +#ifndef LOGFS_FILE_H +#define LOGFS_FILE_H + +#include "adio.h" + +#define LOGFS_FILE_LOG_DATA 1 +#define LOGFS_FILE_LOG_META 2 + + + +/* for debugging */ +#define LOGFS_FILE_DEBUG + +/* ======================================================================== */ +/* == Callbacks =========================================================== */ +/* ======================================================================== */ + +typedef struct { + /* init should prepare both files (meta + data) for writing; + * Needs to set metaofs and dataofs to filesizes */ + int (*init) (void *userdata); + + /* log will be one of LOGFS_FILE_LOG_DATA or LOGFS_FILE_LOG_META */ + int (*write) (void *userdata, ADIO_Offset ofs, const void *data, int size, int log); + + int (*done) (void *userdata); + + /* this function should truncate the specified logfile to the given offset */ + int (*restart) (void *userdata, ADIO_Offset offset, int log); + + /* return filesize; will be called before data is written when reopening + * logfiles */ + int (*getsize) (void *userdata, ADIO_Offset * offset, int log); +} logfs_file_ops; + +typedef struct { + /* function should open metadatalog for reading */ + int (*init) (void *userdata); + + /* read from file; return number of 
bytes read */ + int (*read) (void *userdata, ADIO_Offset offset, void *data, int size, int log); + + /* close metadata log */ + int (*done) (void *userdata); +} logfs_file_readops; + + +/* ======================================================================== */ +/* === Datatype =========================================================== */ +/* ======================================================================== */ + + +/* flatlist like structure */ +typedef struct +{ + MPI_Count count; + ADIO_Offset * blocklens; + ADIO_Offset * indices; +} logfs_file_typeinfo; + + +typedef struct { + int (*init) (void *data); + int (*start_epoch) (void *data, int epoch); + + /* user gets ownership of the ftype and datarep pointers and needs to free + * them [datarep is constant for now ]*/ + int (*set_view) (void *data, ADIO_Offset displacement, + logfs_file_typeinfo * etype, logfs_file_typeinfo * ftype, const char *datarep); + int (*set_size) (void *data, ADIO_Offset size); + int (*write) (void *data, ADIO_Offset fileofs, int size, ADIO_Offset datalogofs); + int (*done) (void *data); +} logfs_file_replayops; + + +/* ======================================================================== */ +/* == Instance type ======================================================= */ +/* ======================================================================== */ + +struct logfs_file_instance; + +typedef struct logfs_file_instance *logfs_file_handle; +typedef const struct logfs_file_instance *logfs_file_consthandle; + + +/* ======================================================================== */ +/* === Public Functions =================================================== */ +/* ======================================================================== */ + +logfs_file_handle logfs_file_create(MPI_Comm comm, + const logfs_file_ops * ops, void *ops_data, + const logfs_file_readops * readops, void *readops_data); + +/* free logfs file instance */ +void logfs_file_free(logfs_file_handle * 
handle); + + +/* access functions */ +/* collective! */ +void logfs_file_record_sync(logfs_file_handle handle); + +void logfs_file_record_view(logfs_file_handle, MPI_Datatype etype, + MPI_Datatype filetype, MPI_Offset displacement, const char *datarep); + + +/* offset should be the high-level MPI offset (meaning expressed in etypes and + * taking displacement of the current view into account) + * Returns start of datablock in data logfile */ +MPI_Offset logfs_file_record_write (logfs_file_handle, const void * buf, int count, + MPI_Datatype memtype, MPI_Offset offset); + +void logfs_file_record_setsize(logfs_file_handle handle, ADIO_Offset size); + +/* flush log files */ +void logfs_file_flush(logfs_file_handle handle); + +/* replay logfile: last=1: only last epoch, last=0 everything */ +/* returns 1 if all OK, 0 if one of the callbacks returned 0 */ +int logfs_file_replay(logfs_file_handle handle, int last, + const logfs_file_replayops * ops, void *opsdata); + +/* clear (truncate) logfiles; last=1: only last epoch; last=0 everything */ +void logfs_file_clear(logfs_file_handle handle, int last); + +/* set epoch number; collective and must be same on all cpus */ +void logfs_file_setepoch(logfs_file_handle handle, int epoch); + + +/* ======================================================================== */ +/* ======================================================================== */ +/* ======================================================================== */ + +/* helper function for typeinfo structures */ +static inline int logfs_file_typeinfo_extent(const logfs_file_typeinfo * info) +{ + const int last = info->count - 1; + int extent = info->indices[last] + info->blocklens[last] - info->indices[0]; + return extent; +} + +static inline int logfs_file_typeinfo_size(const logfs_file_typeinfo * info) +{ + int ret = 0; + int i; + for (i = 0; i < info->count; ++i) + ret += info->blocklens[i]; + return ret; +} + +static inline int 
logfs_file_typeinfo_continuous(const logfs_file_typeinfo * info) +{ + int i; + int last = info->indices[0] + info->blocklens[0]; + for (i = 1; i < info->count; ++i) { + if (info->indices[i] != last) + return 0; + last += info->blocklens[i]; + } + return 1; +} + +#endif diff --git a/src/mpi/romio/adio/ad_logfs/logfs_info.h b/src/mpi/romio/adio/ad_logfs/logfs_info.h new file mode 100644 index 00000000000..6356e5d0f59 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/logfs_info.h @@ -0,0 +1,47 @@ +#ifndef ROMIO_LOGFS_INFO_H +#define ROMIO_LOGFS_INFO_H + +/* + * Header that groups the info keys that have an effect on a logfs + * instance + */ + +/* readmode: one of track_none, track_phased, track_all */ +#define LOGFS_INFO_READMODE "logfs_readmode" + +/* debug logfs */ +#define LOGFS_INFO_DEBUG "logfs_debug" + +/* Time replays */ +#define LOGFS_INFO_TIMEREPLAY "logfs_timereplay" + +/* NOTE: xxxBLOCKCOUNT and xxxBLOCKSIZE should both be set, + * otherwise they have no effect */ + +/* number of blocks in write buffer */ +#define LOGFS_INFO_DATABLOCKCOUNT "logfs_datablockcount" + +/* size of a block in the write buffer */ +#define LOGFS_INFO_DATABLOCKSIZE "logfs_datablocksize" + +/* number of blocks in metadata buffer */ +#define LOGFS_INFO_METABLOCKCOUNT "logfs_metablockcount" + +/* size of a block in the metadata buffer */ +#define LOGFS_INFO_METABLOCKSIZE "logfs_metablocksize" + +/* size of intermediate buffer for replaying log files */ +#define LOGFS_INFO_FLUSHBLOCKSIZE "logfs_flushblocksize" + +/* set base dir for log files */ +#define LOGFS_INFO_LOGBASE "logfs_info_logbase" + +/* for sync mode (no lazy writing) + * In sync mode, no write buffering is done + * (ignoring the blockcount/blocksize keys )*/ +#define LOGFS_INFO_SYNC "logfs_sync" + +/* replay the file when closing */ +#define LOGFS_INFO_REPLAYCLOSE "logfs_replayonclose" + +#endif diff --git a/src/mpi/romio/adio/ad_logfs/logfs_rtree.c b/src/mpi/romio/adio/ad_logfs/logfs_rtree.c new file mode 100644 index 
00000000000..222c9179d91 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/logfs_rtree.c @@ -0,0 +1,490 @@ +#include + +#include "adio.h" +#include "adioi.h" +#include "logfs_rtree.h" +#include "memstack.h" +#include "growvector.h" + + + +/* struct uniquely describing rtree item */ +typedef struct { + rtree_range range; + ADIO_Offset diskstart; +} logfs_rtree_item; + + +typedef struct { + int loops; /* number of write stages we need */ + int globalloops; /* global max number of loops needed */ + int coll; /* collective mode ? */ + void *cbdata; + const logfs_rtree_flush_cb *cb; + MPI_Comm comm; + int bufsize; + void *readbuf; + void *writebuf; + int done; /* flag to see if we're done */ + ADIO_Offset filesize; + + + /* accept state */ + growvector_handle blocklens; /* size of region */ + growvector_handle indices; /* position in datafile */ + growvector_handle realpos; /* position in real file */ + ADIO_Offset writesize; /* amount of data to write so far */ + +} logfs_rtree_flush_state; + +static inline void myqsort_swap(MPI_Aint numbers1[], MPI_Count numbers2[], int s1, int s2) +{ + MPI_Count tmp; + MPI_Aint tmpa; + tmpa = numbers1[s1]; + numbers1[s1] = numbers1[s2]; + numbers1[s2] = tmpa; + tmp = numbers2[s1]; + numbers2[s1] = numbers2[s2]; + numbers2[s2] = tmp; +} + +static inline void myqsort(MPI_Aint numbers1[], MPI_Count numbers2[], int left, int right) +{ + int l_hold = left; + int r_hold = right; + MPI_Aint pivot1 = numbers1[left]; + MPI_Count pivot2 = numbers2[left]; + while (left < right) + { + while ((numbers1[right] >= pivot1) && (left < right)) + right--; + if (left != right) + myqsort_swap(numbers1, numbers2, left, right); + + while ((numbers1[left] <= pivot1) && (left < right)) + left++; + if (left != right) { + myqsort_swap(numbers1, numbers2, left, right); + right--; + } + } + numbers1[left] = pivot1; + numbers2[left] = pivot2; + pivot1 = left; + left = l_hold; + right = r_hold; + if (left < pivot1) + myqsort(numbers1, numbers2, left, pivot1 - 1); + 
if (right > pivot1) + myqsort(numbers1, numbers2, pivot1 + 1, right); +} + +/* takes the growvectors and build list sorted on indices so that the + * resulting list can be used to construct the filetype for the datalog read + */ +static void logfs_rtree_readtypes(logfs_rtree_flush_state *state, MPI_Datatype *memtype, + MPI_Datatype *filetype, int count, MPI_Aint *sortindices, + MPI_Count *sortblocklens) +{ + ADIO_Offset *blocklens = growvector_get_null(state->blocklens); + int i; + ADIO_Offset *tmp1 = (ADIO_Offset *) growvector_get_null(state->indices); + ADIO_Offset *tmp2 = (ADIO_Offset *) growvector_get_null(state->blocklens); + + /* TODO here: see if we can join readblocks */ + + /* convert ADIO_Offset values to MPI_Aint / MPI_Count */ + for (i = 0; i < count; ++i) + { + sortindices[i] = (MPI_Aint)(*tmp1++); + sortblocklens[i] = (MPI_Count)(*tmp2++); + } + + /* The sorted index for the original order must also be known in order to + * create the memtype */ + /* (translated from the original Dutch comment; the sort below only feeds + * the filetype that is created right after it) */ + myqsort(sortindices, sortblocklens, 0, count - 1); + ADIOI_Type_create_hindexed_x(count, sortblocklens, sortindices, MPI_BYTE, filetype); + MPI_Type_commit(filetype); + + /* now reuse sortindices to create memtype */ + /* PROBABLY WRONG!!! This does not reorder the data in memory! */ + /* NOTE(review): the memtype is laid out in unsorted (push) order while the filetype is sorted on indices -- confirm the two still pair up 
*/ + sortindices[0] = 0; + sortblocklens[0] = blocklens[0]; + for (i = 1; i < count; ++i) { + sortindices[i] = sortindices[i - 1] + blocklens[i - 1]; + sortblocklens[i] = blocklens[i]; + } + + ADIOI_Type_create_hindexed_x(count, sortblocklens, sortindices, MPI_BYTE, memtype); + MPI_Type_commit(memtype); + + +} + +/* take the lists in blocklens, indices & realpos and try to initiate a + * read operation; if the read is ready, try to start a write operation */ +static void logfs_rtree_replay_startwrite(logfs_rtree_flush_state * state) +{ + MPI_Datatype readmemtype; + MPI_Datatype readfiletype; + MPI_Datatype writefiletype; + int segmentcount = growvector_size(state->indices); + + /* temp buffers for ADIO_Offset -> int conversion */ + MPI_Aint *sortindices = (MPI_Aint *)ADIOI_Malloc(sizeof(MPI_Aint) * segmentcount); + MPI_Count *sortblocklens = (MPI_Count *)ADIOI_Malloc(sizeof(MPI_Count) * segmentcount); + + void *buffer = state->readbuf; + ADIO_Offset *tmp1; + ADIO_Offset *tmp2; + int i; + + /* create the read types: + * - a (sorted) list for reading from the datalog + * - a reordering memory type (putting data in order of the dataorder in + * the real file) + * - reuses buffers created above + **/ + + /* If we don't have anything to do, we don't have to read our logfile + * (since that one is private to our CPU) + * but we DO have to write if we are collective + */ + if (segmentcount) { + + logfs_rtree_readtypes(state, &readmemtype, &readfiletype, segmentcount, + sortindices, sortblocklens); + + /* read */ + state->cb->readstart(buffer, readmemtype, readfiletype, state->cbdata); + state->cb->readwait(state->cbdata); + + MPI_Type_free(&readmemtype); + MPI_Type_free(&readfiletype); + + /* + * if needed, we need to do 0-byte collective writes + * (but not in this function) until everybody is finished + */ + + + /* filetype for real file; need to convert to ints */ + tmp1 = (ADIO_Offset *)growvector_get_null(state->blocklens); + tmp2 = (ADIO_Offset 
*)growvector_get_null(state->realpos); + for (i = 0; i < segmentcount; ++i) + { + sortblocklens[i] = (MPI_Count)(*tmp1++); + sortindices[i] = (MPI_Aint)(*tmp2++); + } + + ADIOI_Type_create_hindexed_x(segmentcount, sortblocklens, sortindices, MPI_BYTE, &writefiletype); + MPI_Type_commit(&writefiletype); + + } + else { + /* create dummy write type */ + MPI_Type_contiguous(0, MPI_BYTE, &writefiletype); + MPI_Type_commit(&writefiletype); + } + + /* write */ + state->cb->writestart(buffer, writefiletype, state->writesize, state->cbdata); + state->cb->writewait(state->cbdata); + state->writesize = 0; + MPI_Type_free(&writefiletype); + + growvector_clear(state->blocklens); + growvector_clear(state->indices); + growvector_clear(state->realpos); + + /* free conversion buffers */ + ADIOI_Free(sortindices); + ADIOI_Free(sortblocklens); + + /* if we are collective, do an alreduce in the end to find the maximum + * filesize and to see if everybody is done... */ +} + +/* try to add region; Return remaining bytes if full */ +/* call with rangestart == rangestop == 0 for indicating the end of data */ +/* probably need to add extra vars here: alldone and filesize */ +static ADIO_Offset logfs_rtree_flush_add(logfs_rtree_flush_state * state, + ADIO_Offset rangestart, ADIO_Offset rangestop, + ADIO_Offset fileofs) +{ + ADIO_Offset leftover = 0; + ADIO_Offset thiswrite = rangestop - rangestart; + int forcewrite = 0; + + /* accept data until we hit the buffer size */ + /* (maybe split access) */ + /* reduce size if needed */ + if (thiswrite + state->writesize > state->bufsize) { + thiswrite = (state->bufsize - state->writesize); + leftover = rangestop - rangestart - thiswrite; + rangestop = rangestart + thiswrite; + + /* could do something smart here and decide it is not worth it to split + * up a small region into even smaller ones (e.g. splitting a 8 byte + * write into 2 regions...) 
*/ + } + + if (thiswrite) { + /* add write to lists */ + growvector_pushback(state->blocklens, &thiswrite, sizeof(thiswrite)); + growvector_pushback(state->indices, &fileofs, sizeof(fileofs)); + growvector_pushback(state->realpos, &rangestart, sizeof(fileofs)); + state->writesize += thiswrite; + } + else { + /* thiswrite == 0, meaning the user passed rangestart == rangestop + * -> no more data, force write */ + forcewrite = 1; + } + + /* progress reads & writes */ + + if (state->writesize < state->bufsize && !forcewrite) + return leftover; + + /* OK, databuffer is full or there will be no more data + * Do write */ + logfs_rtree_replay_startwrite(state); + /* filled up this buffer: will issue a collective write operation with + * real data, so we have one less no-op write to issue */ + state->loops--; + + return leftover; +} + + +/* accept a region from the rtree; Add the data to the list of data + * to be written; If the buffer size is full, redo with the rest of the region + * */ +static int logfs_rtree_flush_accept(const rtree_range * range, ADIO_Offset * fileofs, void *extra) +{ + logfs_rtree_flush_state *state = (logfs_rtree_flush_state *) extra; + + /* all the bytes described by this node of the tree */ + ADIO_Offset todo = range->stop - range->start; + /* bytes processed by slave file system */ + ADIO_Offset done = 0; + /* bytes processed in a specific flush_add call. Large ranges will require + * multiple rounds */ + ADIO_Offset nbytes; + + /* this loop is semantically similar to that in ad_write.c, where we need to + * write N bytes but can only issue a smaller number at a time */ + while (todo) + { + /* a small range that does not fill up the buffer could be handled in + * one shot. In fact, it might take multiple tree nodes to fill up the + * buffer. 
large ranges, though, will require multiple rounds */ + todo = logfs_rtree_flush_add (state, range->start + done, + range->stop, (*fileofs) + done); + nbytes = range->stop - (range->start + done) - todo; + done += nbytes; + } + /* after loop exits, there is likely (unless perfectly multiple of buffer + * size) a partial write. calling code will send an "end of data" call, + * which will flush everything outstanding . */ + + return 1; +} + + +void logfs_rtree_flush(logfs_rtree * tree, int bufsize, + const logfs_rtree_flush_cb * cb, void *cbdata, int coll, + ADIO_Offset * filesize, MPI_Comm comm) +{ + logfs_rtree_flush_state state; + + if (!tree->rtree) return; + + state.coll = coll; + state.cbdata = cbdata; + state.cb = cb; + state.comm = comm; + state.bufsize = bufsize; + state.readbuf = ADIOI_Malloc(bufsize); + state.writebuf = ADIOI_Malloc(bufsize); + state.globalloops = 0; + state.done = 0; + state.filesize = *filesize; + state.writesize = 0; + + /* allocate arrays; preallocate 1k for each */ + state.blocklens = growvector_create(sizeof(ADIO_Offset), (1024 / 8)); + state.indices = growvector_create(sizeof(ADIO_Offset), (1024 / 8)); + state.realpos = growvector_create(sizeof(ADIO_Offset), (1024 / 8)); + + if (state.coll) { + int buf = bufsize; +#ifndef NDEBUG + MPI_Bcast(&buf, 1, MPI_INT, 0, comm); + assert(buf == bufsize); +#endif + } + + assert(bufsize); + + /* calculate number of write stages */ + state.loops = (tree->rangesize + bufsize - 1) / bufsize; + + if (coll) { + MPI_Allreduce(&state.loops, &state.globalloops, 1, MPI_INT, MPI_MAX, comm); + if (state.globalloops > state.loops) + state.loops = state.globalloops; + } + + /* Notify caller that we are going to start */ + state.cb->start(state.cbdata, state.coll); + + /* for now go over tree in order */ + rtree_walk(tree->rtree, logfs_rtree_flush_accept, &state); + + /* indicate end of data by writing empty region */ + logfs_rtree_flush_add(&state, 0, 0, 0); + + if (state.coll) { + while (state.loops > 0) { 
+ /* Possible for I/O workload to be imbalanced across processes. + * Issue zero-byte writes until other processes have finished their + * collective writes */ + state.cb->writestart(NULL, MPI_BYTE, 0, state.cbdata); + state.cb->writewait(state.cbdata); + state.loops--; + } + } + + + /* shut down callbacks */ + state.cb->stop(state.cbdata); + + /* cleanup */ + growvector_free(&state.blocklens); + growvector_free(&state.indices); + growvector_free(&state.realpos); + + /* === */ + ADIOI_Free(state.readbuf); + ADIOI_Free(state.writebuf); + + /* update filesize: global maximum in collective mode, local value otherwise */ + if (coll) + MPI_Allreduce(&state.filesize, filesize, 1, ADIO_OFFSET, MPI_MAX, comm); + else + *filesize = state.filesize; +} + + + +/* rtree callback that copies the range and its diskstart into a new entry pushed on the memstack passed via 'extra'; always returns 1 */ +static int rtree_add_item(const rtree_range * range, ADIO_Offset * diskstart, void *extra) +{ + memstack_handle list = (memstack_handle) extra; + logfs_rtree_item *newitem = (logfs_rtree_item *) + memstack_push(list); + + /* copy item into memstack */ + newitem->diskstart = *diskstart; + newitem->range = *range; + + return 1; +} + +static inline void logfs_rtree_updatesize(ADIO_Offset * datasize, ADIO_Offset delta) +{ /* adds delta to *datasize; no-op when datasize is NULL; asserts that a negative delta does not exceed the current total */ + if (!datasize) + return; + + if (delta < 0) { + assert(*datasize >= -delta); + } + *datasize += delta; +} + +/* TODO: merge adjacent regions if possible! 
+ * Could be done by expanding the region by 1 on the left side (cannot + * normally merge on the right side since the datalog offsets only increase) + * If we find something, check if the datalog regions also match; if so, + * expand existing region */ +void logfs_rtree_addsplit(logfs_rtree * tree, ADIO_Offset start, ADIO_Offset + stop, ADIO_Offset diskstart) +{ + rtree_range newrange; + memstack_handle list; + int listsize; + rtree_handle rtree = tree->rtree; + ADIO_Offset *datasize = &tree->rangesize; + + newrange.start = start; + newrange.stop = stop; + + list = memstack_create(sizeof(logfs_rtree_item)); + +#if 0 + if (newrange.start > 0) + --newrange.start; +#endif + + /* find regions in tree overlapping our new region */ + rtree_overlap(rtree, &newrange, rtree_add_item, list); + + /* process the search results from the list, removing the region from the + * tree if it is completely inside the new region, and splitting it + * otherwise */ + listsize = memstack_getsize(list); + while (listsize--) { + logfs_rtree_item *cur = (logfs_rtree_item *) memstack_pop(list); + assert(cur); + + /* In any case the region needs to be removed */ + rtree_remove(rtree, &cur->range, 0); + logfs_rtree_updatesize(datasize, -(cur->range.stop - cur->range.start)); + + /* find out if it is completely inside; if so skip it */ + if (cur->range.start >= newrange.start && cur->range.stop <= newrange.stop) + continue; + + /* need to split the region */ + /* calculate split on the left */ + if (cur->range.start < newrange.start) { + rtree_range new; + new.start = cur->range.start; + new.stop = newrange.start; + assert(new.start < new.stop); + + /* add left part */ + rtree_add(rtree, &new, cur->diskstart); + logfs_rtree_updatesize(datasize, new.stop - new.start); + } + if (cur->range.stop > newrange.stop) { + rtree_range new; + new.start = newrange.stop; + new.stop = cur->range.stop; + assert(new.start < new.stop); + + /* add right part, shifting diskstart + * (except when the diskstart is 
invalid)*/ + rtree_add(rtree, &new, + (cur->diskstart == ADIO_OFFSET_INVALID ? + ADIO_OFFSET_INVALID : cur->diskstart + (new.start - cur->range.start))); /* compare, do not assign: an invalid diskstart stays invalid, a valid one is shifted by the bytes cut off on the left */ + logfs_rtree_updatesize(datasize, new.stop - new.start); + } + + /* + * IDEA: could also record 'free' items in logfile; could be reused but + * will cause disk seeking! + */ + } + + /* add the region */ + rtree_add(rtree, &newrange, diskstart); + logfs_rtree_updatesize(datasize, newrange.stop - newrange.start); + memstack_free(&list); +} diff --git a/src/mpi/romio/adio/ad_logfs/logfs_rtree.h b/src/mpi/romio/adio/ad_logfs/logfs_rtree.h new file mode 100644 index 00000000000..2a3a39bbe58 --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/logfs_rtree.h @@ -0,0 +1,52 @@ +/***************************************************************** + * + * Helper functions for dealing with types and rtrees + * + *****************************************************************/ + +#ifndef LOGFS_RTREE_H +#define LOGFS_RTREE_H + +#include "rtree.h" +#include "adio.h" + + +/* indicate invalid offset in tree */ +#define ADIO_OFFSET_INVALID ((ADIO_Offset) -1) + +typedef struct { + rtree_handle rtree; + ADIO_Offset rangesize; /* sum of all ranges in tree */ +} logfs_rtree; + +/* add a continuous disk region to the rtree, splitting/removing existing + * regions if needed */ +void logfs_rtree_addsplit(logfs_rtree * tree, + ADIO_Offset start, ADIO_Offset stop, ADIO_Offset diskstart); + + +void logfs_rtree_type2tree(rtree_handle rtree, ADIO_Offset disp, MPI_Datatype + filetype, ADIO_Offset bytes); + + + +typedef struct { + int (*start) (void *data, int collective); + int (*readstart) (void *buf, MPI_Datatype memtype, MPI_Datatype filetype, void *userdata); + int (*readwait) (void *userdata); + int (*writestart) (void *buf, MPI_Datatype type, int bytes, void *userdata); + int (*writewait) (void *userdata); + int (*stop) (void *data); +} logfs_rtree_flush_cb; + +/* dump rtree to disk */ +/* TODO: in collective mode, could make this smarter and 
try to have as + * much write overlap when writing to the real file as possible + * (or maybe just the opposite) + * + * If collective, bufsize has to be equal on all cpus */ +void logfs_rtree_flush(logfs_rtree * tree, int bufsize, + const logfs_rtree_flush_cb * cb, void *cbdata, int coll, + ADIO_Offset * filesize, MPI_Comm comm); + +#endif diff --git a/src/mpi/romio/adio/ad_logfs/logfs_user.h b/src/mpi/romio/adio/ad_logfs/logfs_user.h new file mode 100644 index 00000000000..a7efa5e18be --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/logfs_user.h @@ -0,0 +1,14 @@ +/* + * Public Header file supporting user (i.e. not inside MPI/ROMIO) replay of + * logfs files + */ + +#ifndef LOGFS_USER_H +#define LOGFS_USER_H + +typedef struct { + int (*init) (); + int (*done) (); +} logfs_user_replay_cb; + +#endif diff --git a/src/mpi/romio/adio/ad_logfs/rtree_config.h b/src/mpi/romio/adio/ad_logfs/rtree_config.h new file mode 100644 index 00000000000..1b3e4203adc --- /dev/null +++ b/src/mpi/romio/adio/ad_logfs/rtree_config.h @@ -0,0 +1,10 @@ +#ifndef RTREE_CONFIG_H +#define RTREE_CONFIG_H + +#include "adio.h" + +#define RTREE_RANGE_TYPE ADIO_Offset +#define RTREE_DATA_TYPE ADIO_Offset +#define RTREE_DATA_TYPE_PRINTF "%llu" +#define RTREE_RANGE_TYPE_PRINTF "%llu" +#endif diff --git a/src/mpi/romio/adio/ad_lustre/ad_lustre_rwcontig.c b/src/mpi/romio/adio/ad_lustre/ad_lustre_rwcontig.c index 4b8bd4e97dc..4f84495f1c6 100644 --- a/src/mpi/romio/adio/ad_lustre/ad_lustre_rwcontig.c +++ b/src/mpi/romio/adio/ad_lustre/ad_lustre_rwcontig.c @@ -140,7 +140,7 @@ static void ADIOI_LUSTRE_IOContig(ADIO_File fd, const void *buf, int count, ADIO_Offset offset, ADIO_Status *status, int io_mode, int *error_code) { - ssize_t err=-1; + ssize_t err=-2; size_t rw_count; ADIO_Offset bytes_xfered=0; MPI_Count datatype_size, len; diff --git a/src/mpi/romio/adio/ad_pvfs2/ad_pvfs2_open.c b/src/mpi/romio/adio/ad_pvfs2/ad_pvfs2_open.c index c5d933f42b5..ef405aed4da 100644 --- 
a/src/mpi/romio/adio/ad_pvfs2/ad_pvfs2_open.c +++ b/src/mpi/romio/adio/ad_pvfs2/ad_pvfs2_open.c @@ -111,7 +111,7 @@ static void fake_an_open(PVFS_fs_id fs_id, char *pvfs_name, int access_mode, } o_status->object_ref = resp_create.ref; } else { - FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n"); + /*FPRINTF(stderr, "cannot create file without MPI_MODE_CREATE\n");*/ o_status->error = ret; return; } diff --git a/src/mpi/romio/adio/common/Makefile.mk b/src/mpi/romio/adio/common/Makefile.mk index a06f0590a7f..d8586d6f052 100644 --- a/src/mpi/romio/adio/common/Makefile.mk +++ b/src/mpi/romio/adio/common/Makefile.mk @@ -73,5 +73,10 @@ romio_other_sources += \ adio/common/p2p_aggregation.c \ adio/common/onesided_aggregation.c \ adio/common/ad_tuning.c \ - adio/common/utils.c - + adio/common/utils.c\ + adio/common/growvector.c\ + adio/common/layered.c \ + adio/common/memstack.c\ + adio/common/typehelper.c\ + adio/common/writering.c \ + adio/common/rtree.c diff --git a/src/mpi/romio/adio/common/ad_close.c b/src/mpi/romio/adio/common/ad_close.c index 987dbe07e8f..ee7a0282cf8 100644 --- a/src/mpi/romio/adio/common/ad_close.c +++ b/src/mpi/romio/adio/common/ad_close.c @@ -90,7 +90,7 @@ void ADIO_Close(ADIO_File fd, int *error_code) ADIOI_Free(fd->file_realm_types); } ADIOI_Free(fd->hints); - + ADIOI_Free(fd->fns); MPI_Comm_free(&(fd->comm)); diff --git a/src/mpi/romio/adio/common/ad_fstype.c b/src/mpi/romio/adio/common/ad_fstype.c index 94451c6c29f..0a72ef54987 100644 --- a/src/mpi/romio/adio/common/ad_fstype.c +++ b/src/mpi/romio/adio/common/ad_fstype.c @@ -118,9 +118,75 @@ static void ADIO_FileSysType_parentdir(const char *filename, char **dirnamep); #endif #endif static void ADIO_FileSysType_prefix(const char *filename, int *fstype, + ADIOI_Fns **ops, int *error_code); static void ADIO_FileSysType_fncall(const char *filename, int *fstype, int *error_code); +struct ADIO_FSTypes +{ + ADIOI_Fns * fileops; /* function table */ + int fstype; /* ADIO_xxx constant */ 
+ const char * prefix; /* file prefix */ +}; + +/* + * To add an ADIO + * - add to the table below + * - add a constant for your ADIO in include/adio.h + * - add a guarded include in include/adioi_fs_proto.h NOTE(review): the prefix code this table replaces also accepted "panfs:", which has no entry here -- confirm that dropping it is intended + */ +static struct ADIO_FSTypes fstypes[] = { +#ifdef ROMIO_UFS + { &ADIO_UFS_operations, ADIO_UFS, "ufs:"}, +#endif +#ifdef ROMIO_NFS + { &ADIO_NFS_operations, ADIO_NFS, "nfs:"}, +#endif +#ifdef ROMIO_XFS + { &ADIO_XFS_operations, ADIO_XFS, "xfs:"}, +#endif +#ifdef ROMIO_PVFS2 + { &ADIO_PVFS2_operations, ADIO_PVFS2, "pvfs2:"}, +#endif +#ifdef ROMIO_TESTFS + { &ADIO_TESTFS_operations, ADIO_TESTFS, "testfs:"}, +#endif +#ifdef ROMIO_LOGFS + { &ADIO_LOGFS_operations, ADIO_LOGFS, "logfs:"}, +#endif +#ifdef ROMIO_GPFS + { &ADIO_GPFS_operations, ADIO_GPFS, "gpfs:"}, +#endif +#ifdef ROMIO_LUSTRE + { &ADIO_LUSTRE_operations, ADIO_LUSTRE, "lustre:"}, +#endif + +#if 0 + /* cannot force icache / trace since there is no way to specify + * the underlying filesystem; NOTE(review): these disabled entries carry four initializers while the struct has three members -- fix before enabling */ +#ifdef ROMIO_TRACE + { &ADIO_TRACE_operations, ADIO_TRACE, "trace:", "trace"}, +#endif +#ifdef ROMIO_ICACHE + { &ADIO_ICACHE_operations, ADIO_ICACHE, "icache:", "icache"}, +#endif +#endif + { 0, 0, 0} /* guard entry: a null fileops pointer ends every table walk */ +}; + +/* return a pointer to the filesystem prefix of the first table entry whose fstype equals filetype + * Return 0 if not found */ +const char * ADIO_FileTypeToPrefix (int filetype) +{ + int i=0; + while (fstypes[i].fileops) + { + if (fstypes[i].fstype == filetype) + return fstypes[i].prefix; + ++i; + } + return 0; +} /* ADIO_FileSysType_parentdir - determines a string pathname for the @@ -518,49 +584,45 @@ Output Parameters: is considered an error. Except for on Windows systems where the default is NTFS. 
*/ -static void ADIO_FileSysType_prefix(const char *filename, int *fstype, int *error_code) +static void ADIO_FileSysType_prefix(const char *filename, int *fstype, + ADIOI_Fns **ops, int *error_code) { - static char myname[] = "ADIO_RESOLVEFILETYPE_PREFIX"; + char * cpy = 0; + int i; *error_code = MPI_SUCCESS; + *fstype = -1; + char myname[] = "ADIO_FileSysType_prefix"; - if (!strncmp(filename, "ufs:", 4) || !strncmp(filename, "UFS:", 4)) { - *fstype = ADIO_UFS; - } - else if (!strncmp(filename, "nfs:", 4) || !strncmp(filename, "NFS:", 4)) { - *fstype = ADIO_NFS; - } - else if (!strncmp(filename, "panfs:", 6) || !strncmp(filename, "PANFS:", 6)) { - *fstype = ADIO_PANFS; - } - else if (!strncmp(filename, "xfs:", 4) || !strncmp(filename, "XFS:", 4)) { - *fstype = ADIO_XFS; - } - else if (!strncmp(filename, "pvfs2:", 6)||!strncmp(filename, "PVFS2:", 6)) { - *fstype = ADIO_PVFS2; - } - else if (!strncmp(filename, "testfs:", 7) - || !strncmp(filename, "TESTFS:", 7)) + /* search table for prefix */ + cpy = ADIOI_Strdup (filename); + ADIOI_Strlower (cpy); + + i=0; + while (fstypes[i].fileops) { - *fstype = ADIO_TESTFS; + if (!strncmp (fstypes[i].prefix, cpy, strlen(fstypes[i].prefix))) + { + *fstype = fstypes[i].fstype; + *ops = fstypes[i].fileops; + break; + } + ++i; } - else if (!strncmp(filename, "lustre:", 7) - || !strncmp(filename, "LUSTRE:", 7)) + + if (-1 == *fstype) { - *fstype = ADIO_LUSTRE; - } - else if (!strncmp(filename, "gpfs:", 5) || !strncmp(filename, "GPFS:", 5)) { - *fstype = ADIO_GPFS; - } - else { *fstype = 0; - /* --BEGIN ERROR HANDLING-- */ - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_NO_SUCH_FILE, - "**filename", "**filename %s", filename); - /* --END ERROR HANDLING-- */ + /* --BEGIN ERROR HANDLING-- */ + *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, + myname, __LINE__, MPI_ERR_NO_SUCH_FILE, + "**filename", "**filename %s", filename); + /* --END ERROR HANDLING-- */ + 
*error_code = MPI_ERR_UNKNOWN; } + ADIOI_Free (cpy); } + /*@ ADIO_ResolveFileType - determines file system type and operations from file name string; this is a collective call @@ -588,10 +650,11 @@ void ADIO_ResolveFileType(MPI_Comm comm, const char *filename, int *fstype, static char myname[] = "ADIO_RESOLVEFILETYPE"; char * p; + *ops = 0; file_system = -1; if (filename == NULL) { - *error_code = ADIOI_Err_create_code(myname, filename, ENOENT); - return; + *error_code = ADIOI_Err_create_code(myname, filename, ENOENT); + return; } tmp = strchr(filename, ':'); if (!tmp) { @@ -655,7 +718,7 @@ void ADIO_ResolveFileType(MPI_Comm comm, const char *filename, int *fstype, * * perhaps we should have this code go through the allreduce as well? */ - ADIO_FileSysType_prefix(filename, &file_system, &myerrcode); + ADIO_FileSysType_prefix(filename, &file_system, ops, &myerrcode); if (myerrcode != MPI_SUCCESS) { *error_code = myerrcode; return; @@ -671,94 +734,35 @@ void ADIO_ResolveFileType(MPI_Comm comm, const char *filename, int *fstype, * including the colon! 
*/ p = getenv("ROMIO_FSTYPE_FORCE"); if (p != NULL) { - ADIO_FileSysType_prefix(p, &file_system, &myerrcode); + ADIO_FileSysType_prefix(p, &file_system, ops, &myerrcode); if (myerrcode != MPI_SUCCESS) { *error_code = myerrcode; return; } } - - /* verify that we support this file system type and set ops pointer */ - if (file_system == ADIO_UFS) { -#ifndef ROMIO_UFS - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_IO, - "**iofstypeunsupported", 0); - return; -#else - *ops = &ADIO_UFS_operations; -#endif - } - if (file_system == ADIO_NFS) { -#ifndef ROMIO_NFS - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_IO, - "**iofstypeunsupported", 0); - return; -#else - *ops = &ADIO_NFS_operations; -#endif - } - if (file_system == ADIO_PANFS) { -#ifndef ROMIO_PANFS - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_IO, - "**iofstypeunsupported", 0); - return; -#else - *ops = &ADIO_PANFS_operations; -#endif - } - if (file_system == ADIO_XFS) { -#ifndef ROMIO_XFS - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_IO, - "**iofstypeunsupported", 0); - return; -#else - *ops = &ADIO_XFS_operations; -#endif - } - if (file_system == ADIO_PVFS2) { -#ifndef ROMIO_PVFS2 - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_IO, - "**iofstypeunsupported", 0); - return; -#else - *ops = &ADIO_PVFS2_operations; -#endif - } - if (file_system == ADIO_TESTFS) { -#ifndef ROMIO_TESTFS - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_IO, - "**iofstypeunsupported", 0); - return; -#else - *ops = &ADIO_TESTFS_operations; -#endif + /* verify that we support this file system: look for filesystem in the + * fstypes tables */ + if (! 
(*ops)) + { + int i=0; + while (fstypes[i].fileops) + { + if (file_system == fstypes[i].fstype) + { + *ops = fstypes[i].fileops; + break; + } + ++i; + } } - - if (file_system == ADIO_GPFS) { -#ifndef ROMIO_GPFS + if (! (*ops)) + { *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, - myname, __LINE__, MPI_ERR_IO, - "**iofstypeunsupported", 0); + myname, __LINE__, MPI_ERR_IO, + "**iofstypeunsupported", 0); return; -#else - *ops = &ADIO_GPFS_operations; -#endif } - if (file_system == ADIO_LUSTRE) { -#ifndef ROMIO_LUSTRE - *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_IO, "**iofstypeunsupported", 0); - return; -#else - *ops = &ADIO_LUSTRE_operations; -#endif - } *error_code = MPI_SUCCESS; *fstype = file_system; return; diff --git a/src/mpi/romio/adio/common/ad_open.c b/src/mpi/romio/adio/common/ad_open.c index 5cdb94c7a41..fec0dcfcb7b 100644 --- a/src/mpi/romio/adio/common/ad_open.c +++ b/src/mpi/romio/adio/common/ad_open.c @@ -5,6 +5,7 @@ * See COPYRIGHT notice in top-level directory. 
*/ +#include #include "adio.h" #include "adio_extern.h" #include "adio_cb_config_list.h" @@ -58,7 +59,9 @@ MPI_File ADIO_Open(MPI_Comm orig_comm, fd->file_system = file_system; fd->fs_ptr = NULL; - fd->fns = ops; + fd->fns = (ADIOI_Fns *) ADIOI_Malloc(sizeof(ADIOI_Fns)); + /* make a copy of the fns struct for now */ + *fd->fns = *ops; fd->disp = disp; fd->split_coll_count = 0; @@ -67,6 +70,7 @@ MPI_File ADIO_Open(MPI_Comm orig_comm, fd->etype = etype; /* MPI_BYTE by default */ fd->filetype = filetype; /* MPI_BYTE by default */ fd->etype_size = 1; /* default etype is MPI_BYTE */ + fd->datarep = ADIOI_Strdup ("native"); /* default datarep is native */ fd->file_realm_st_offs = NULL; fd->file_realm_types = NULL; @@ -205,6 +209,8 @@ MPI_File ADIO_Open(MPI_Comm orig_comm, ADIOI_Free(fd->hints->ranklist); ADIOI_Free(fd->hints->cb_config_list); ADIOI_Free(fd->hints); + ADIOI_Free(fd->datarep); + ADIOI_Free(fd->fns); if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info)); ADIOI_Free(fd->io_buf); ADIOI_Free(fd); diff --git a/src/mpi/romio/adio/common/ad_set_view.c b/src/mpi/romio/adio/common/ad_set_view.c index 449cf220391..76db62d2a88 100644 --- a/src/mpi/romio/adio/common/ad_set_view.c +++ b/src/mpi/romio/adio/common/ad_set_view.c @@ -60,11 +60,18 @@ void ADIO_Set_view(ADIO_File fd, ADIO_Offset disp, MPI_Datatype etype, ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (filetype_is_contig) fd->fp_ind = disp; else { - flat_file = ADIOI_Flatten_and_find(fd->filetype); - for (i=0; icount; i++) { - if (flat_file->blocklens[i]) { - fd->fp_ind = disp + flat_file->indices[i]; - break; + if (fd->file_system == ADIO_LOGFS) { + fd->fp_ind = disp; + /* skip the type processing for logfs. 
A logfs write will already take + * into account the lower bound markers */ + ; + } else { + flat_file = ADIOI_Flatten_and_find(fd->filetype); + for (i=0; icount; i++) { + if (flat_file->blocklens[i]) { + fd->fp_ind = disp + flat_file->indices[i]; + break; + } } } } diff --git a/src/mpi/romio/adio/common/ad_tuning.c b/src/mpi/romio/adio/common/ad_tuning.c index fd39213ca9c..5ad258bbb13 100644 --- a/src/mpi/romio/adio/common/ad_tuning.c +++ b/src/mpi/romio/adio/common/ad_tuning.c @@ -27,6 +27,7 @@ int romio_onesided_no_rmw; int romio_onesided_always_rmw; int romio_onesided_inform_rmw; int romio_tunegather; +int romio_agg_map_policy; /* set internal variables for tuning environment variables */ /** \page mpiio_vars MPIIO Configuration @@ -108,4 +109,18 @@ void ad_get_env_vars() { romio_tunegather = 1; x = getenv( "ROMIO_TUNEGATHER" ); if (x) romio_tunegather = atoi(x); +/* + * - AGG_MAP_POLICY - Define aggregator layout among nodes + * Possible values: + * - 0 - Original layout (blocked layout in each node) + * - 1 - Round-robin layout if # procs per node exceeds 1 + * - Default is 0 + */ + + romio_agg_map_policy = 0; + x = getenv("AGG_MAP_POLICY"); + if (x) romio_agg_map_policy = atoi(x); +#ifdef CB_CONFIG_LIST_DEBUG + FPRINTF(stderr, "romio_agg_map_policy=%d\n", romio_agg_map_policy); +#endif } diff --git a/src/mpi/romio/adio/common/growvector.c b/src/mpi/romio/adio/common/growvector.c new file mode 100644 index 00000000000..5923d2cecd1 --- /dev/null +++ b/src/mpi/romio/adio/common/growvector.c @@ -0,0 +1,51 @@ +#include +#include "growvector.h" + +void growvector_clear (growvector_handle handle) +{ + handle->size = 0; +} + + +void growvector_free (growvector_handle * handle) +{ + free ((*handle)->data); + free(*handle); + *handle = 0; +} + +growvector_handle growvector_create (int elesize, int cap) +{ + growvector_handle handle = (growvector_handle) + malloc (sizeof(struct growvector_instance)); + + assert (elesize); + + handle->data =0; + handle->size = 0; + 
handle->capacity = 0; + handle->elesize = elesize; + + return handle; +} + +int growvector_reserve (growvector_handle handle, int wanted, int strict) +{ + if (!strict && handle->capacity > wanted) + return handle->capacity; + + assert (wanted >= handle->size); + + handle->data = realloc (handle->data, handle->elesize * wanted); + assert (handle->data); + handle->capacity = wanted; + + return handle->capacity; +} + +void growvector_grow (growvector_handle handle) +{ + growvector_reserve (handle, (handle->capacity ? handle->capacity*2 + : (GROWVECTOR_MINSIZE / handle->elesize)), 0); + assert (handle->capacity > handle->size); +} diff --git a/src/mpi/romio/adio/common/layered.c b/src/mpi/romio/adio/common/layered.c new file mode 100644 index 00000000000..0f5e1d00429 --- /dev/null +++ b/src/mpi/romio/adio/common/layered.c @@ -0,0 +1,120 @@ +#include "layered.h" + +int ADIOI_Layer_SetInfo (ADIO_File fd, MPI_Info users_info, + int * error_code) +{ + if (!fd->fs_ptr) + { + /* this means we are being called from within the ADIO_Open function + * and we don't have the slave yet */ + + /* add the keys to the info argument */ + assert (fd->info == MPI_INFO_NULL); + MPI_Info_dup (users_info, &fd->info); + + return 1; + } + return 0; +} + +int ADIOI_Layer_fcntl (ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, + int * error_code) +{ + ADIOI_Layer_data * ld; + + if (ADIO_FCNTL_SET_SLAVE != flag) + return 0; + + /* fcntl_struct points to the ADIOI_Fns_struct of the slave + * make a copy of the structure + */ + + /* validate that this fd is ready for layered operations */ + ADIOI_Layer_validate (fd); + ld = (ADIOI_Layer_data *) fd->fs_ptr; + + /* check that we didn't have this call already */ + // assert (0 == ld->slave_ops); + + /* the slave is set in ADIO_Layer_init */ + assert (ld->slave_ops); + + + *error_code = MPI_SUCCESS; + return 1; +} + +void ADIOI_Layer_init (ADIO_File fd, ADIOI_Fns * fns, void * masterdata, + int * error_code, int already_open) +{ + void * handle; + 
ADIOI_Layer_data * layerdata = ADIOI_Malloc (sizeof(ADIOI_Layer_data)); + + layerdata->magic = ROMIO_LAYER_MAGIC; + layerdata->master_data = masterdata; + + /* save original fns pointer for malloc/free consistency issues */ + layerdata->orig_fns = fd->fns; + + /* if already_openened, the fd->fns points to the slave ops, + * if not, fd->fns points to the masters ops */ + layerdata->master_ops = (ADIOI_Fns*) ADIOI_Malloc (sizeof (ADIOI_Fns)); + *layerdata->master_ops = (already_open ? *fns : *fd->fns); + + layerdata->slave_data = fd->fs_ptr; + layerdata->slave_ops = (ADIOI_Fns*) ADIOI_Malloc (sizeof (ADIOI_Fns)); + /* take copy */ + /* if already_openened, the slave is already active and fd->fns points to + * the slave ops */ + *layerdata->slave_ops = (already_open ? *fd->fns : *fns); + + fd->fs_ptr = layerdata; + + /* --- now the layering is initialized --- */ + + if (!already_open) + { + /* open the slave */ + handle = ADIOI_Layer_switch_in (fd); + fd->fns->ADIOI_xxx_Open (fd, error_code); + ADIOI_Layer_switch_out (fd, handle); + + if (MPI_SUCCESS == error_code) + { + /* inform master that slave is opened and ready */ + fd->fns->ADIOI_xxx_Fcntl (fd, ADIO_FCNTL_SET_SLAVE, + 0, error_code); + } + } + else + { + fd->fns = layerdata->master_ops; + } +} + +void * ADIOI_Layer_done (ADIO_File fd) +{ + void * ret ; + ADIOI_Layer_validate (fd); + + /* switchin restores the slave and returns the ADIOI_Layer_data struct */ + //ADIOI_Layer_data * d = ADIOI_Layer_switch_in (fd); + ADIOI_Layer_data * d = (ADIOI_Layer_data *) fd->fs_ptr; + ret = d->master_data; + + fd->fns = d->orig_fns; + *fd->fns = *d->slave_ops; + + ADIOI_Free (d->master_ops); + ADIOI_Free (d->slave_ops); + ADIOI_Free (d); + return ret; +} + + +int ADIOI_Layer_is_slave_set (ADIO_File fd) +{ + ADIOI_Layer_validate (fd); + return (0 != ((ADIOI_Layer_data*) fd->fs_ptr)->slave_ops); +} + diff --git a/src/mpi/romio/adio/common/memstack.c b/src/mpi/romio/adio/common/memstack.c new file mode 100644 index 
00000000000..8907d0fd7b5 --- /dev/null +++ b/src/mpi/romio/adio/common/memstack.c @@ -0,0 +1,191 @@ +#include +#include +#include "memstack.h" + +struct memstack_memblock +{ + int size; /* nb of elements in block */ + int used; /* nb of used elements inblock */ + struct memstack_memblock * prev; + struct memstack_memblock * next; + char * data; +}; + +typedef struct memstack_memblock memstack_memblock; + + +struct memstack_instance +{ + int size; /* nb of elements in stack */ + int elesize; /* bytes / element */ + int spare; /* spare capacity */ + + memstack_memblock * mem; /* points to current head block */ + +}; + +typedef struct memstack_instance memstack_instance; + + +memstack_handle memstack_create (int elesize) +{ + memstack_handle ret = malloc (sizeof(memstack_instance)); + assert (ret); + + ret->size = 0; + ret->elesize = elesize; + ret->mem = 0; + ret->spare = 0; + return ret; +} + +int memstack_getsize (memstack_consthandle handle) +{ + return handle->size; +} + +static void memstack_addblock (memstack_handle handle) +{ + int blocksize = (MEMSTACK_BLOCKSIZE * 1024) / (handle->elesize); + assert (blocksize); + + char * newmem = malloc (blocksize*handle->elesize); + memstack_memblock * newnode = malloc (sizeof(memstack_memblock)); + + newnode->data = newmem; + newnode->size = blocksize; + newnode->prev = 0; + newnode->next = 0; + newnode->used = 0; + + handle->spare += blocksize; + + if (!handle->mem) + { + handle->mem = newnode; + return; + } + + /* add block after current head */ + newnode->prev = handle->mem; + newnode->next = handle->mem->next; + handle->mem->next = newnode; + if (newnode->next) + newnode->next->prev = newnode; +} + +void memstack_reducemem (memstack_handle handle) +{ + memstack_memblock * cur = handle->mem; + + + /* since mem always pointss to the current head, only need to look forward + * */ + while (cur) + { + memstack_memblock * tmp; + + /* see if we can free remove the block; if not, try the next one */ + if (cur->used) + { + cur 
= cur->next; + continue; + } + + /* we can free the current block */ + tmp = cur; + cur = cur->next; + + free (tmp->data); + if (tmp->prev) + tmp->prev->next = tmp->next; + if (tmp->next) + tmp->next->prev = tmp->prev; + + handle->spare -= tmp->size; + free (tmp); + } + + /* only way the head gets freed if is the total size is 0 */ + if (!handle->size) + handle->mem = 0; +} + +/* erase all elements, keeping memory blocks (to reduce memory, + * call memstack_reducemem) */ +void memstack_clear (memstack_handle handle) +{ + memstack_memblock * cur = handle->mem; + + if (!handle->size) + return; + + /* find most left (prev) block, set the root pointer to it + * and clear count */ + assert (cur); + + while (cur && cur->prev) + cur=cur->prev; + + handle->mem = cur; + handle->spare += handle->size; + handle->size = 0; +} + +void memstack_free (memstack_handle * handle) +{ + memstack_clear (*handle); + memstack_reducemem (*handle); + + assert (0==(*handle)->size); + assert (0==(*handle)->spare); + + free ((*handle)); + *handle = 0; +} + +char * memstack_push (memstack_handle handle) +{ + char * ret; + + if (!handle->mem || (handle->mem->used == handle->mem->size)) + { + assert (0==handle->spare); + memstack_addblock (handle); + assert (handle->spare == handle->mem->size); + assert (0==handle->mem->used); + } + + assert (handle->mem); + assert (handle->spare); + + ret = handle->mem->data + (handle->elesize * handle->mem->used); + + ++handle->mem->used; + ++handle->size; + --handle->spare; + + return ret; +} + +char * memstack_pop (memstack_handle handle) +{ + char * ret = 0; + assert (handle->size); + if (!handle->size) + return ret; + + assert (handle->mem); + assert (handle->mem->used); + + /* first decrease page usage counter and use as index in page */ + --handle->mem->used; + ret = handle->mem->data + (handle->mem->used * handle->elesize); + --handle->size; + ++handle->spare; + + if (!handle->mem->used && handle->mem->prev) + handle->mem = handle->mem->prev; + + return 
ret; +} diff --git a/src/mpi/romio/adio/common/rtree.c b/src/mpi/romio/adio/common/rtree.c new file mode 100644 index 00000000000..814ee1071d9 --- /dev/null +++ b/src/mpi/romio/adio/common/rtree.c @@ -0,0 +1,2027 @@ +#include +#include +#include + +#ifdef USE_EFENCE +#include +#endif + +#include +#include + +#include "rtree.h" + + +#define RTREE_CHILD_MIN 1 /* minimum children in node */ +#define RTREE_CHILD_MAX 4 /* maximum number of children */ + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +/* + * TODO: + * - Make MIN/MAX child configurable at tree compilation time? + */ +struct rtree_node; + +struct rtree_entry +{ + rtree_range range; + RTREE_DATA_TYPE data; +} ; + +typedef struct rtree_entry rtree_entry; + +struct rtree_node +{ + rtree_range range; + struct rtree_node * parent; + union + { + struct rtree_node * child[RTREE_CHILD_MAX]; + struct rtree_entry * entry[RTREE_CHILD_MAX]; + }; +} ; + +typedef struct rtree_node rtree_node; + + +struct rtree +{ + struct rtree_node * root; /* pointer to root node */ + int depth; /* index of lowest level */ + int count; /* number of data items in the tree */ + + rtree_callback freefunc; /* called when removing an item from the tree */ + void * freedata; /* extra pointer passed to the free function */ + + rtree_callback_split splitfunc; /* pointer to split function */ + void * splitdata; /* extra data */ +}; + +typedef struct rtree rtree; + +typedef int (*rtree_callback_visit) (rtree_const_handle tree, const rtree_node * node, + int depth, void * extra); + + /*======== iterator functions ===========*/ + +struct rtree_iterator +{ + int createdepth; + int * childnum; + rtree_node * node; + rtree_handle tree; + int depth; +}; + +typedef struct rtree_iterator rtree_iterator; + +/**************************************************************************** + * Overlap helper functions * + ****************************************************************************/ + +static inline 
RTREE_RANGE_TYPE rtree_range_type_min + (RTREE_RANGE_TYPE a, RTREE_RANGE_TYPE b); + +static inline RTREE_RANGE_TYPE rtree_range_type_max + (RTREE_RANGE_TYPE a, RTREE_RANGE_TYPE b); + +/* Return true if the ranges intersect */ +static inline int rtree_range_has_overlap (const rtree_range * range1, + const rtree_range * range2); + +/* If there is overlap, return true and return common range*/ +static inline int rtree_range_shared (const rtree_range * range1, + const rtree_range * range2, rtree_range * dest); + +/* Calculate the smallest range containing both rectangles */ +static inline void rtree_range_extent (const rtree_range * range1, + const rtree_range * range2, rtree_range * dest); + +/* Extend range by adding range2; Store result in range1 */ +static inline void rtree_range_extend (rtree_range * range1, + const rtree_range * range2); + +/* Order rectangles, range1 first */ +static inline void rtree_range_order (const rtree_range * * range1, + const rtree_range * * range2); + +/* return size of the range */ +static inline RTREE_RANGE_TYPE rtree_range_size (const rtree_range * r); + +/* calculate how much the range size increases when adding the r2 to r1 */ +static inline RTREE_RANGE_TYPE rtree_range_calc_extension (const rtree_range * r1, + const rtree_range * r2); + +/* Check if the range is empty */ +static inline int rtree_range_empty (const rtree_range * r); + +/* check if the ranges are equal */ +static inline int rtree_range_equals (const rtree_range * r1, const rtree_range * r2); + +/* check if the first range contains all of the second range */ +static inline int rtree_range_contains (const rtree_range * r1, const rtree_range * r2); + +/**************************************************************************** + * Memory functions * + ****************************************************************************/ + +static inline void * rtree_mem_alloc_raw (int bytes) +{ + void * ptr = malloc (bytes); +#ifdef RTREE_INIT_MEM + if (ptr) + memset (ptr, 0, 
bytes); +#endif + return ptr; +} + +static inline void rtree_mem_free_raw (void * ptr, int bytes) +{ +#ifdef RTREE_CLEAR_MEM + memset (ptr, 0, bytes); +#endif + free (ptr); +} + +static inline rtree_iterator * rtree_mem_alloc_iterator () +{ + rtree_iterator * i = rtree_mem_alloc_raw (sizeof (rtree_iterator)); + return i; +} + +static inline void rtree_mem_free_iterator (rtree_iterator ** iter) +{ + assert (iter); + rtree_mem_free_raw (*iter, sizeof(rtree_iterator)); + *iter=0; +} + +static inline rtree_node * rtree_mem_alloc_node () +{ + int i; + rtree_node * n = rtree_mem_alloc_raw (sizeof (rtree_node)); + + n->range.start = n->range.stop = 0; + n->parent = 0; + for (i=0; ichild[i] = 0; + return n; +} + +static inline void rtree_mem_free_node (rtree_node ** node) +{ + assert (node); + rtree_mem_free_raw (*node, sizeof(rtree_node)); + *node=0; +} + +static inline void rtree_mem_free_tree (rtree ** tree) +{ + assert (tree); + rtree_mem_free_raw (*tree, sizeof(rtree)); + *tree = 0; +} + +static inline rtree * rtree_mem_alloc_tree () +{ + return rtree_mem_alloc_raw (sizeof (rtree)); +} + +static inline rtree_entry * rtree_mem_alloc_entry + (const rtree_range * range, RTREE_DATA_TYPE data) +{ + rtree_entry * new = rtree_mem_alloc_raw (sizeof (rtree_entry)); + new->range = *range; + new->data = data; + return new; +} + +static inline void rtree_mem_free_entry (rtree_entry ** entry) +{ + assert (entry); + rtree_mem_free_raw (*entry, sizeof(rtree_entry)); + *entry = 0; +} + + + + +/**************************************************************************** + * Public rtree functions * + ****************************************************************************/ +static int rtree_node_compare_entry (const void * n1, const void * n2) +{ + if (*(const rtree_entry **)n1 == 0) + return ( *(const rtree_node **) n2 == 0 ? 
0 : -1); + + if (*(const rtree_entry **)n2 == 0) + /* we know that n1 cannot be 0 here */ + return 1; + + /* n1 and n2 != 0 */ + RTREE_RANGE_TYPE s1 = (*(const rtree_entry **)n1)->range.start; + RTREE_RANGE_TYPE s2 = (*(const rtree_entry **)n2)->range.start; + if (s1 < s2) + return -1; + if (s1 > s2) + return 1; + return 0; +} + + +static int rtree_node_compare_node (const void * n1, const void * n2) +{ + if (*(const rtree_node **)n1 == 0) + return ( *(const rtree_node **) n2 == 0 ? 0 : -1); + + if (*(const rtree_node **)n2 == 0) + /* we know that n1 cannot be 0 here */ + return 1; + + /* n1 and n2 != 0 */ + RTREE_RANGE_TYPE s1 = (*(const rtree_node **)n1)->range.start; + RTREE_RANGE_TYPE s2 = (*(const rtree_node **)n2)->range.start; + if (s1 < s2) + return -1; + if (s1 > s2) + return 1; + return 0; +} + +/* sort the child/entry pointers in a node so that iteraters will traverse + * the tree in order and so intra-node searches can be made faster */ +static void rtree_node_sort (rtree_node * node, int count, int leaf) +{ + /* probably very bad for already sorted sequences! Consider other + * sorting algorithms */ + qsort ((leaf ? (void *) &node->child[0] : (void *) &node->entry[0]), + count, + sizeof (rtree_node *), + (leaf ? 
rtree_node_compare_node : rtree_node_compare_entry)); + +} + +static void rtree_find_internal (const rtree_node * node, + const rtree_range * range, int depth, int treedepth, + const rtree_node ** nodeptr, int * entrynum) +{ + int i; + + assert (node); + assert (nodeptr); + + if (depth != treedepth) + { + for (i=0; ichild[i]) + break; + + if (rtree_range_contains (&node->child[i]->range, range)) + { + rtree_find_internal (node->child[i], + range, depth+1, treedepth, nodeptr, entrynum); + + /* if this subtree contains the item, return + * otherwise keep looking since another child could also contain + * the item */ + if (*nodeptr) + return; + } + } + } + else + { + for (i=0; ientry[i]) + break; + + if (rtree_range_equals(range, &node->entry[i]->range)) + { + /* found item */ + *nodeptr = node; + *entrynum = i; + return; + } + } + } + + /* didn't find the range */ + return; +} + +RTREE_DATA_TYPE * rtree_find (rtree_const_handle tree, + const rtree_range * range) +{ + const rtree_node * nodeptr = 0; + int entrynum = 0; + + assert (tree); + rtree_find_internal (tree->root, range, 0, tree->depth, + &nodeptr, &entrynum); + + if (nodeptr) + return &nodeptr->entry[entrynum]->data; + else + return 0; +} + + +static int rtree_overlap_internal (const rtree_node * node, const rtree_range * range, + rtree_callback callback, int depth, int treedepth, void * extra) +{ + int i; + + assert (node); + + if (depth == treedepth) + { + /* At lowest level; We have pointers to entries */ + for (i=0; ientry[i]) + { + assert ((depth == 0 && i>=0) || + (depth > 0 && i>=1)); + break; + } + + if (rtree_range_has_overlap (range, &node->entry[i]->range)) + { + /* Found qualifying entry */ + if (!callback (&node->entry[i]->range, &node->entry[i]->data, + extra)) + return FALSE; + } + } + return TRUE; + } + + /* Not at lowest level yet; we have child nodes */ + for (i=0; ichild[i]) + { + assert (i>0); + break; + } + if (rtree_range_has_overlap (range, &node->child[i]->range)) + { + /* search 
subtree */ + if (!rtree_overlap_internal (node->child[i], range, callback, + depth+1, treedepth, extra)) + return FALSE; + } + } + return TRUE; +} + +/* + * Check all children, ignore subtrees that do not overlap + * Return false if the callback returned false + * otherwise return true + */ +int rtree_overlap (rtree_const_handle tree, const rtree_range * range, + rtree_callback callback, void * extra) +{ + assert (tree->root); + + return rtree_overlap_internal (tree->root, range, callback, 0, + tree->depth, extra); +} + + +void rtree_free_node (rtree_const_handle tree, rtree_node * node, int depth, int treedepth) +{ + int i; + if (depth == treedepth) + { + /* node contains pointers to entries */ + for (i=0; ientry[i]) + break; + + if (tree->freefunc) + { + tree->freefunc (&node->entry[i]->range, + &node->entry[i]->data, tree->freedata); + } + rtree_mem_free_entry (&node->entry[i]); + } + } + else + { + /* free all child nodes, then free our own node */ + for (i=0; ichild[i]) + break; + rtree_free_node (tree, node->child[i], depth+1, treedepth); + } + } + rtree_mem_free_node (&node); +} + +void rtree_free (rtree_handle * rtree) +{ + if (!rtree) return; + if (*rtree == NULL) return; + if ((*rtree)->root == NULL) return; + + rtree_free_node(*rtree, (*rtree)->root, 0, (*rtree)->depth); + (*rtree)->root = 0; /* rtree_free_node already frees mem */ + + rtree_mem_free_tree (rtree); +} + +int rtree_empty (rtree_const_handle tree) +{ + + /* null tree must be empty, right?*/ + if (!tree) return 1; + return (rtree_get_count (tree) == 0); +} + +void rtree_clear (rtree_handle rtree) +{ + if (!rtree->count) + return; + + assert (rtree->root); + + rtree_free_node (rtree, rtree->root, 0, rtree->depth); + + /* rtree_free_node also free'd the root node itself, + * and we want every rtree (also an empty on) to have a valid rtree node + * so create a new one */ + + rtree->root = rtree_mem_alloc_node(); + rtree->root->range.start = 0; + rtree->root->range.stop = 0; + + rtree->count = 
0; + rtree->depth = 0; +} + + +rtree_handle rtree_create () +{ + int i; + rtree_handle ret = rtree_mem_alloc_tree(); + assert (ret); + + ret->depth = 0; + ret->root = rtree_mem_alloc_node(); + ret->root->range.start = 0; + ret->root->range.stop = 0; + + ret->count = 0; + + for (i=0; iroot->entry[i]=0; + + rtree_set_splitfunc (ret, 0, 0); + rtree_set_freefunc (ret, 0, 0); + + return ret; +} + +/* + * Find the (possibly leaf) node at depth wanted that will cause the least extension + * of the bounding box when adding the specified range + */ +static rtree_node * rtree_add_choosenode (rtree_node * node, const rtree_range * range, + int depth, int treedepth, int wanted) +{ + int i; + int childcount = RTREE_CHILD_MAX; + int addchild; + + RTREE_RANGE_TYPE addrange; + RTREE_RANGE_TYPE addincrease; + RTREE_RANGE_TYPE tmprange; + RTREE_RANGE_TYPE increase[RTREE_CHILD_MAX]; + + /* cannot be looking below desired depth */ + assert (depth <= wanted); + + if (depth == wanted) + return node; + + /* calculate increase of the node when adding the rectangle there */ + for (i=0; ichild[i]) + { + childcount = i; + break; + } + increase[i] = rtree_range_calc_extension (&node->child[i]->range, range); + } + + /* select the one with the least increase */ + addchild = 0; + addincrease = increase[0]; + addrange = rtree_range_size(&node->child[addchild]->range); + + assert (childcount > 0); + for (i=1; i addincrease) + continue; + + /* if the increase is the same, keep the one that + * has the smallest range */ + if (increase[i] == addincrease) + { + tmprange = rtree_range_size(&node->child[i]->range); + if (tmprange >= addrange) + continue; + } + + addchild=i; + addrange =tmprange; + addincrease=increase[addchild]; + } + + /* we found the best child node to insert the new range */ + return rtree_add_choosenode (node->child[addchild], range, + depth+1, treedepth, wanted); +} + +/* Partition the ranges from source in two sections; + * Denoted by 0 or 1 in mapping + * source and mapping are 
of size count; + * All entries need to be checked [there could be 0 pointers in the middle] + */ +static void rtree_add_splitnode_decide (const rtree_range ** source, + int * mapping, int count) +{ + int i; + RTREE_RANGE_TYPE mostleft; + RTREE_RANGE_TYPE mostright; + int mostrightid, mostleftid; + + assert (source); + assert (source[0]); + + mostleft = source[0]->start; + mostright = source[0]->stop; + mostleftid = mostrightid = 0; + + /* find the points most to the left and to the right */ + for (i=0; istart < mostleft) + { + mostleft = source[i]->start; + mostleftid = i; + } + + if (source[i]->stop > mostright) + { + mostright = source[i]->stop; + mostrightid = i; + } + } + + if (mostleftid == mostrightid) + { + fprintf(stderr, "rtree: complete overlap detected; node splitting" + " doesn't handle this well!\n"); + /* select another one */ + if (mostleftid == (count-1) || !source[mostleftid+1]) + mostrightid=0; + else + mostrightid=mostleftid+1; + } + + assert (mostleftid != mostrightid); + assert (source[mostleftid]); + assert (source[mostrightid]); + mapping[mostleftid]=0; + mapping[mostrightid]=1; + + /* go over rest and decide which side they belong on */ + for (i=0; istop - source[mostleftid]->stop; + rightdiff = source[mostrightid]->start - source[i]->start; + mapping[i] = (leftdiff > rightdiff ? 
1 : 0); + } +} + + +/* + * Collect range pointers from the node + */ +static void rtree_add_getranges (const rtree_node * source, const rtree_range ** ranges, int leaf) +{ + int i=0; + if (leaf) + { + for (i=0; ientry[i]->range; + } + else + { + for (i=0; ichild[i]->range; + } +} + + +/* + * Fix parent pointer for non-leaf nodes + */ +static void rtree_node_fixparent (rtree_node * node) +{ + int i; + for (i=0; ichild[i]) + break; + node->child[i]->parent = node; + } +} + +/* split node [nonleaf] + * Does not update bounding range + * Sets parent pointer and the parent pointer of the children + */ +static void rtree_add_splitnode_node (rtree_const_handle tree, + rtree_node ** source, rtree_node ** s1, + rtree_node ** s2, rtree_node * newchild) +{ + int i; + int mapping[RTREE_CHILD_MAX+1]; + const rtree_range * ranges[RTREE_CHILD_MAX+1]; + + int leftpos = 0; + int rightpos = 0; + + assert (newchild); + + ranges[RTREE_CHILD_MAX] = &newchild->range; + rtree_add_getranges (*source, ranges,FALSE); + + tree->splitfunc (ranges, mapping, RTREE_CHILD_MAX+1, + tree->splitdata); + + *s1 = rtree_mem_alloc_node (); + *s2 = rtree_mem_alloc_node (); + + (*s1)->parent = (*source)->parent; + (*s2)->parent = (*source)->parent; + + /* add old children */ + for (i=0; ichild[rightpos++] = (*source)->child[i]; + else + (*s1)->child[leftpos++] = (*source)->child[i]; + + assert (mapping[i] == 0 || mapping[i] ==1 ); + } + + /* add new child */ + if (mapping[RTREE_CHILD_MAX]) + { + + (*s2)->child[rightpos++] = newchild; +#ifdef RTREE_SORT_ENTRIES + rtree_node_sort (*s2, rightpos, 0); +#endif + } + else + { + (*s1)->child[leftpos++] = newchild; +#ifdef RTREE_SORT_ENTRIES + rtree_node_sort (*s1, leftpos, 0); +#endif + } + + /* Fix parent pointer in child nodes */ + rtree_node_fixparent (*s1); + rtree_node_fixparent (*s2); + + rtree_mem_free_node (source); +} + +/* split node [must be leaf] + * Does not update bounding range + * Sets parent pointer to that of the source node + **/ +void 
rtree_add_splitnode_leaf (rtree_const_handle tree, + rtree_node ** source, rtree_node ** s1, rtree_node ** s2, + rtree_entry * newentry) +{ + int i; + int mapping[RTREE_CHILD_MAX+1]; + const rtree_range * ranges[RTREE_CHILD_MAX+1]; + + int leftpos = 0; + int rightpos = 0; + + assert (newentry); + + ranges[RTREE_CHILD_MAX] = &newentry->range; + rtree_add_getranges (*source, ranges,TRUE); + + /* calculate new distribution */ + tree->splitfunc (ranges, mapping, RTREE_CHILD_MAX+1, tree->splitdata); + + *s1 = rtree_mem_alloc_node (); + *s2 = rtree_mem_alloc_node (); + + (*s1)->parent = (*source)->parent; + (*s2)->parent = (*source)->parent; + + /* add old entries */ + for (i=0; ientry[rightpos++] = (*source)->entry[i]; + else + (*s1)->entry[leftpos++] = (*source)->entry[i]; + + assert (mapping[i] == 0 || mapping[i] ==1 ); + } + + /* add new entry */ + if (mapping[RTREE_CHILD_MAX]) + { + (*s2)->entry[rightpos++] = newentry; +#ifdef RTREE_SORT_ENTRIES + rtree_node_sort (*s2, rightpos, 1); +#endif + } + else + { + (*s1)->entry[leftpos++] = newentry; +#ifdef RTREE_SORT_ENTRIES + rtree_node_sort (*s1, leftpos, 1); +#endif + } + + + + /* entry nodes have no parent pointer so nothing to correct */ + rtree_mem_free_node (source); +} + + +static void rtree_node_fix_extent (rtree_node * node, int leaf) +{ + int i=0; + assert (node); + + /* could do some define hack to avoid duplicate code */ + + if (!leaf) + { + if (node->child[0]) + { + node->range = node->child[0]->range; + } + else + { + node->range.start = node->range.stop = 0; + } + + for (i=1; ichild[i]) + break; + rtree_range_extend (&node->range, &node->child[i]->range); + } + } + else + { + if (node->entry[0]) + { + node->range = node->entry[0]->range; + } + else + { + node->range.start = node->range.stop = 0; + } + + for (i=1; ientry[i]) + break; + rtree_range_extend (&node->range, &node->entry[i]->range); + } + } +} + +/* find an empty slot in the node; RTREE_CHILD_MAX if full */ +static inline int 
rtree_node_findempty_child (const rtree_node * n) +{ + int i; + assert (n); + for (i=0; ichild[i]) + break; + } + return i; +} + +/* find empty entry */ +static inline int rtree_node_findempty_entry (const rtree_node * n) +{ + int i; + assert (n); + for (i=0; ientry[i]) + break; + } + return i; +} + +/* + * Fixes extent of n1 and n2 if not zero; + * If n2 is non-zero, add it to the parent of n1 (which could result in node + * splits) + */ +static void rtree_add_adjusttree (rtree_handle tree, rtree_node * n1, + rtree_node * n2, int depth) +{ + rtree_node * s1 = 0 ; + rtree_node * s2 = 0; + + assert (n1); + assert (depth >= 0); + + /* adjust bounding box */ + rtree_node_fix_extent (n1, depth==tree->depth); + + if (n2) + rtree_node_fix_extent (n2, depth==tree->depth); + + /* Is n1 the root? */ + if (!n1->parent) + { + assert (!n2 || !n2->parent); + + if (n2) + { + /* split went up to the root, create new root node */ + rtree_node * newroot = rtree_mem_alloc_node (); + + newroot->parent = 0; + newroot->child[0]=n1; + newroot->child[1]=n2; + + /* fix parent pointer */ + rtree_node_fixparent (newroot); + rtree_node_fix_extent (newroot, 0 == tree->depth); + + tree->root = newroot; + ++(tree->depth); + } + + /* extent of n1, n2 and parent is ok; we're done */ + return; + } + + /* was there a split? 
*/ + if (n2) + { + /* try to add new node to the parent of n1 */ + rtree_node * parent; + int pos; + + /* try to add the new node to the parent */ + /* the parent is for sure a normal non-leaf node */ + parent = n1->parent; + assert (n2->parent == parent); + pos = rtree_node_findempty_child (parent); + + if (pos == RTREE_CHILD_MAX) + { + const rtree_node * old = parent; + rtree_node * parentparent = parent->parent; + int j; + + /* Parent node is also full; split */ + rtree_add_splitnode_node (tree, &parent, &s1, &s2, n2); + + if (parentparent) + { + /* parent gets freed because of splitnode; but the parent of + * parent has a child pointer to parent, which needs to be updated + */ + for (j=0; jchild[j]==old) + { + parentparent->child[j] = s1; + break; + } + } + assert (j!=RTREE_CHILD_MAX); + } + } + else + { + /* room in node; add */ + assert (!parent->child[pos]); + assert (n2->parent == parent); + parent->child[pos] = n2; + + s1 = n1->parent; + s2 = 0; +#ifdef RTREE_SORT_NODES + rtree_node_sort (parent, pos+1, 0); +#endif + } + } + else + { + s1 = n1->parent; + s2 = 0; + } + + /* adjust the parent */ + rtree_add_adjusttree (tree, s1, s2, depth-1); +} + +/* try to add entry to the given node + * Does NOT update bounding rectangle + * Return true if succeeded + * */ +static int rtree_add_try_leaf (rtree_node * node, rtree_entry * new) +{ + int i; + for (i=0; ientry[i]) + { + /* found empty space */ + break; + } + } + + if (i != RTREE_CHILD_MAX) + { + node->entry[i] = new; +#ifdef RTREE_SORT_ENTRIES + /* for now, just resort the whole thing afterwards */ + rtree_node_sort (node, i+1, 1); +#endif + return 1; + } + else + return 0; +} + +/* Add entry; Does NOT update tree->count */ +static void rtree_add_entry (rtree_handle tree, + rtree_entry * newentry) +{ + rtree_node * addpoint = 0; + rtree_node * s1 = 0; + rtree_node * s2 = 0; + int done; + + /* find a leaf node where the new range fits best */ + addpoint = rtree_add_choosenode (tree->root, &newentry->range, + 0, 
tree->depth, tree->depth); + + /* if there is still space in the node, add it there + * This does NOT update the bounding box of the node */ + done = rtree_add_try_leaf (addpoint, newentry); + + if (done) + { + /* add worked */ + s1 = addpoint; s2 = 0; addpoint = 0; + } + else + { + int j; + rtree_node * parentparent = addpoint->parent; + rtree_node * old = addpoint; + + /* leaf node is full; Split and add new value to one of the nodes */ + /* we know this is a leaf node */ + rtree_add_splitnode_leaf (tree, &addpoint, &s1, &s2, newentry); + + /* if the node has a parent, we need to update the child pointer since + * 'addpoint' node could be freed during the split */ + if (parentparent) + { + for (j=0; jchild[j]==old) + { + parentparent->child[j] = s1; + break; + } + } + assert (j!=RTREE_CHILD_MAX); + } + } + + /* now addpoint is 0 + * It there was a split, s2 is not null; Together s1 and s2 contain + * all entries of addpoint and the newly added entry + * Adjust bounding boxes along the path to the root; + * Integrate the splitted node if needed + */ + rtree_add_adjusttree (tree, s1, s2, tree->depth); + +} + +/* try to add newnode to the given node + * Does NOT update bounding rectangle + * Return true if succeeded + * */ +static int rtree_add_try_node (rtree_node * node, rtree_node * new) +{ + int i; + for (i=0; ichild[i]) + { + /* found empty space */ + break; + } + } + +#ifndef RTREE_NODE_SORT + if (i != RTREE_CHILD_MAX) + { + node->child[i] = new; + return 1; + } + else + return 0; +#else + /* there is space, find the best position and add there */ + assert (false); +#endif +} + + +/* Try to add newnode to node 'node' */ +static void rtree_add_node (rtree_handle tree, rtree_node * node, rtree_node * newnode) +{ + rtree_node * s1; + rtree_node * s2; + int done; + + /* if there is still space in the node, add it there + * This does NOT update the bounding box of the node */ + done = rtree_add_try_node (node,newnode); + + if (done) + { + /* add worked */ + s1 = 
node; s2 = 0; node = 0; + } + else + { + int j; + rtree_node * parentparent = node->parent; + rtree_node * old = node; + + /* leaf node is full; Split and add new value to one of the nodes */ + /* we know this is a leaf node */ + rtree_add_splitnode_node (tree, &node, &s1, &s2, newnode); + + /* if the node has a parent, we need to update the child pointer since + * 'addpoint' node could be freed during the split */ + if (parentparent) + { + for (j=0; jchild[j]==old) + { + parentparent->child[j] = s1; + break; + } + } + assert (j!=RTREE_CHILD_MAX); + } + } + + /* now addpoint is 0 + * It there was a split, s2 is not null; Together s1 and s2 contain + * all entries of addpoint and the newly added entry + * Adjust bounding boxes along the path to the root; + * Integrate the splitted node if needed + */ + rtree_add_adjusttree (tree, s1, s2, tree->depth); + +} + + +void rtree_add (rtree_handle tree, const rtree_range * range, + RTREE_DATA_TYPE data) +{ + rtree_entry * newentry = 0; + + /* Create entry for it */ + newentry = rtree_mem_alloc_entry (range, data); + + /* add it */ + rtree_add_entry (tree, newentry); + + /* update count */ + ++tree->count; +} + + + + +int rtree_walk_internal (const rtree_node * node, rtree_callback callback, + void * extra, int depth, int treedepth ) +{ + int i; + + assert (node); + if (depth < treedepth) + { + for (i=0; ichild[i]) + { + assert (i>0); + break; + } + if (!rtree_walk_internal (node->child[i], + callback, extra, depth+1, treedepth)) + return FALSE; + } + return TRUE; + } + + /* lowest level */ + for (i=0; ientry[i]) + break; + + if (!callback (&node->entry[i]->range, + &node->entry[i]->data, extra)) + return FALSE; + } + return TRUE; +} + +int rtree_walk (rtree_const_handle tree, rtree_callback callback, void * extra) +{ + if(tree == NULL) return 1; + if (tree->root == NULL) return 1; + + return rtree_walk_internal (tree->root, callback, extra, 0, tree->depth); +} + +static int rtree_walk_all_internal (const rtree_node * node, 
rtree_callback_all callback, + rtree_callback_all_info * info, int depth, int treedepth ) +{ + int i; + + assert (node); + + + /* show the current node */ + info->depth = depth; + assert (info->treedepth == treedepth); + info->nodeid = (void *) node; + info->parentid = node->parent; + info->data = 0; + info->range = &node->range; + + if (!callback (info)) + return FALSE; + + if (depth < treedepth) + { + for (i=0; ichild[i]) + { + assert (i>0); + break; + } + + if (!rtree_walk_all_internal (node->child[i], + callback, info, depth+1, treedepth)) + return FALSE; + } + return TRUE; + } + + /* lowest level */ + info->parentid = (void *) node; + info->depth = depth + 1; + for (i=0; ientry[i]) + break; + + /* show the entries of the leaf node; depth will be > treedepth */ + info->nodeid = node->entry[i]; + info->range = &node->entry[i]->range; + info->data = &node->entry[i]->data; + if (!callback (info)) + return FALSE; + } + return TRUE; +} + + +int rtree_walk_all (rtree_const_handle tree, rtree_callback_all callback, void * extra) +{ + assert (tree); + assert (tree->root); + + rtree_callback_all_info info; + info.extra =extra; + info.treedepth = tree->depth; + info.tree = tree; + + + return rtree_walk_all_internal (tree->root, callback, &info, 0, tree->depth); +} + + +void rtree_get_range (rtree_const_handle tree, rtree_range * range) +{ + assert (range); + assert (tree); + assert (tree->root); + *range = tree->root->range; +} + +int rtree_get_depth (rtree_const_handle tree) +{ + assert (tree); + return tree->depth; +} + + +static inline void rtree_dump_internal_indent (int amount) +{ + int i; + for (i=0; irange.start, node->range.stop); + + if (depth == tree->depth) + { + for (i=0; ientry[i]) + break; + rtree_dump_internal_indent (2*depth+1); + printf ("=> ENTRY [" RTREE_RANGE_TYPE_PRINTF "," RTREE_RANGE_TYPE_PRINTF "[ " + RTREE_DATA_TYPE_PRINTF "\n", node->entry[i]->range.start, + node->entry[i]->range.stop, node->entry[i]->data); + } + } + return TRUE; +} + +static 
int rtree_visit_nodes_internal (rtree_const_handle tree, const rtree_node * node, + int depth, rtree_callback_visit func, void * extra) +{ + int i; + if (!func (tree, node, depth, extra)) + return FALSE; + + if (depth < tree->depth) + { + for (i=0; ichild[i]) + break; + if (!rtree_visit_nodes_internal (tree, node->child[i], depth+1, + func, extra)) + return FALSE; + } + } + return TRUE; +} + +/* + * For every node, call func with extra + * Stops at the leaf nodes + */ +static int rtree_visit_nodes (rtree_const_handle tree, rtree_callback_visit func, void * extra) +{ + return rtree_visit_nodes_internal (tree, tree->root, 0, func, extra); +} + +void rtree_dump (rtree_const_handle tree) +{ + assert (tree); + rtree_visit_nodes (tree, rtree_dumpfunc, 0); +} + +int rtree_get_child_min (rtree_const_handle tree) +{ + assert (tree); + return RTREE_CHILD_MIN; +} + +int rtree_get_child_max (rtree_const_handle tree) +{ + assert (tree); + return RTREE_CHILD_MAX; +} + +int rtree_get_count (rtree_const_handle tree) +{ + assert (tree); + return tree->count; +} + +void rtree_set_freefunc (rtree_handle tree, rtree_callback func, + void * extra) +{ + assert (tree); + tree->freedata = extra; + tree->freefunc = func; +} + + +static void rtree_default_splitfunc (const rtree_range ** sources, + int * mapping, int count, void * extra) +{ + rtree_add_splitnode_decide(sources,mapping,count); +} + +void rtree_set_splitfunc (rtree_handle tree, rtree_callback_split func, + void * extra) +{ + if (!func) + { + tree->splitdata = 0; + tree->splitfunc = rtree_default_splitfunc; + return; + } + tree->splitdata = extra; + tree->splitfunc = func; +} + +static int rtree_validate_node (rtree_const_handle tree, + const rtree_node * node, int depth, void * extra) +{ + const rtree_range * ranges[RTREE_CHILD_MAX]; + int i; + rtree_range check; + + assert (node); + + if (depth == tree->depth) + { + for (i=0; ientry[i] ? &node->entry[i]->range : 0); + } + else + { + for (i=0; ichild[i] ? 
&node->child[i]->range : 0); + } + + /* Check that node->range equals the extent of + * the ranges of the children */ + if (ranges[0]) + { + check = *ranges[0]; + for (i=1; irange)) + return TRUE; + + fprintf (stderr, "Error in node range!\n"); + assert (FALSE); + return FALSE; +} + +static int rtree_node_count_children (const rtree_node * node, int leaf) +{ + int i=0; + if (leaf) + { + for (i=0; ientry[i]) + break; + } + else + { + for (i=0; ichild[i]) + break; + } + return i; +} + + +/* + * Returns RTREE_CHILD_MAX if the child could not be found in the node */ +static inline int rtree_node_find_child (const rtree_node * n, const rtree_node * child) +{ + int i; + assert (n); + for (i=0; ichild[i]==child) + break; + } + return i; +} + + +static int rtree_validate_count (rtree_const_handle tree, const rtree_node * node, + int depth, void * extra) +{ + int * count = (int *) extra; + + assert (node); assert (extra); + + /* only interested in leaf nodes */ + if (depth != tree->depth) + return TRUE; + + *count += rtree_node_count_children (node, TRUE); + return TRUE; +} + +static int rtree_validate_parent (rtree_const_handle tree, const rtree_node * node, + int depth, void * extra) +{ + int pos; + + assert (node); + + if (!depth) + { + /* root node */ + assert (!node->parent); + return (!node->parent); + } + + pos = rtree_node_find_child (node->parent, node); + if (pos == RTREE_CHILD_MAX) + { + fprintf (stderr, "RTree: node->parent link incorrect!\n"); + assert (node->parent->child[pos] == node); + return FALSE; + } + + return TRUE; +} + +/* + * Validate tree structure + * - test node extent + */ +int rtree_check (rtree_const_handle tree) +{ + int count = 0; + + /* check extent for the nodes */ + if (!rtree_visit_nodes (tree, rtree_validate_node, 0)) + return FALSE; + + /* check child count consistency */ + rtree_visit_nodes (tree, rtree_validate_count, (void*)&count); + assert (count == tree->count); + if (count != tree->count) + { + fprintf (stderr, "RTree: 
tree->count not consistent with " + "entry count\n"); + return FALSE; + } + + /* check parent pointer */ + if (!rtree_visit_nodes (tree, rtree_validate_parent, 0)) + return FALSE; + + return TRUE; +} + + +/*************** node removal ******************/ + +static void rtree_node_remove_entry (rtree_node * node, int entry) +{ + int pos = entry; + + assert (node); + assert (node->entry[entry]); + + node->entry[entry] = 0; + + +#ifdef RTREE_LEAF_SORT + /***** INCORRECT !! [ last pointer twice in index ! ] *****/ + /* move up the remaining entries */ + for (pos = entry; posentry[pos] = node->entry[pos+1]; + if (!node->entry[pos]) + break; + } + assert (false); +#else + /* Find first zero pointer after the removed item */ + for (pos=entry+1; posentry[pos]) + break; + } + /* now pos = last_not_null_idx + 1; If equal to pos+1 -> no elements left */ + assert ((pos <= RTREE_CHILD_MAX) && pos > 0); + if (pos != entry+1) + { + node->entry[entry] = node->entry[pos-1]; + node->entry[pos-1] = 0; + } +#endif + + rtree_node_fix_extent (node, TRUE); +} + + +static void rtree_node_remove_child (rtree_node * node, int childnum) +{ + int pos = childnum; + + assert (node); + assert (node->child[childnum]); + + node->child[childnum] = 0; + +#ifdef RTREE_NODE_SORT + /* move up the remaining entries */ + for (pos = childnum; poschild[pos] = node->child[pos+1]; + if (!node->child[pos]) + break; + } + assert (false); +#else + /* put last child in removed slot*/ + for (pos=childnum+1; poschild[pos]) + break; + } + assert ((pos <= RTREE_CHILD_MAX) && pos > 0); + if (pos != childnum+1) + { + node->entry[childnum] = node->entry[pos-1]; + node->entry[pos-1] = 0; + } +#endif + + rtree_node_fix_extent (node, FALSE); +} + +static void rtree_condensetree (rtree_handle tree, rtree_node * node, int depth) +{ + int remove; + rtree_node * parent = 0; + int childcount; + + assert (node); + + parent = node->parent; + + /* check if node needs to be removed */ + childcount = rtree_node_count_children(node, 
depth==tree->depth); + + remove =(childcount < rtree_get_child_min(tree) ? 1 : 0) + && depth; /* cannot remove root node */ + + if (remove) + { + int childnum; /* child number of this child in the parent */ + + /* not enough children in the node, remove the node + * which means also removing something from the parent node + */ + + if (parent) + { + /* possible performance improvement: avoid the search in the parent + * node to find our pointer */ + + childnum = rtree_node_find_child(parent, node); + assert (childnum != RTREE_CHILD_MAX); + + rtree_node_remove_child (parent, childnum); + } + } + else + { + /* the node stays, just fix the extent */ + rtree_node_fix_extent (node, depth == tree->depth); + } + + /* if we remove or not, need to go up to the parent */ + if (parent) + { + assert (depth); + rtree_condensetree (tree, node->parent, depth-1); + } + + if (remove) + { + /* if we removed a node, insert its remaining children again at the same level */ + /* then free the node */ + int i; + + for (i=0; idepth) + { + if (!node->entry[i]) + break; + + /* reinserting leaf node */ + rtree_add_entry (tree, node->entry[i]); + } + else + { + rtree_node * insertpoint; + /* Reinserting a non-leaf node */ + if (!node->child[i]) + break; + + /* depth is the depth of this node */ + insertpoint = rtree_add_choosenode (tree->root, &node->child[i]->range, + 0, tree->depth, depth); + + rtree_add_node (tree, insertpoint, node->child[i]); + } + } + rtree_mem_free_node (&node); + } +} + +/* See if we can reduce the height of the tree */ +static void rtree_remove_checkroot (rtree_handle tree) +{ + rtree_node * tmp; + int childcount; + + + /* the root node has is a leaf node if the depth of the tree is 0 */ + childcount = rtree_node_count_children (tree->root, tree->depth == 0); + assert (childcount || !tree->depth); + + /* if the there is only one node we cannot remove is (depth=0) */ + if (childcount > 1 || !tree->depth) + return; + + tmp = tree->root->child[0]; + + assert (tmp); + 
assert (rtree_range_equals (&tree->root->range, + &tmp->range)); + + rtree_mem_free_node (&tree->root); + tree->root = tmp; + assert (tree->depth); + --tree->depth; + tree->root->parent = 0; +} + +int rtree_remove (rtree_handle tree, const rtree_range * range, + RTREE_DATA_TYPE * data) +{ + rtree_node * nodeptr = 0; + rtree_entry * entry = 0; + int entrynum = -1; + + /* try to find node first */ + rtree_find_internal (tree->root, range, 0, tree->depth, + (const rtree_node **) /* const cast */ &nodeptr, &entrynum); + + if (!nodeptr) + return FALSE; + + /* found, store data */ + entry = nodeptr->entry[entrynum]; + if (data) + *data = entry->data; + + /* call free function if there is one */ + if (tree->freefunc) + tree->freefunc (&entry->range, &entry->data, tree->freedata); + + /* remove entry pointer from node */ + rtree_node_remove_entry (nodeptr, entrynum); + + /* remove entry from tree */ + rtree_condensetree (tree, nodeptr, tree->depth); + + /* if the root has only one child decrease the height of the tree */ + rtree_remove_checkroot (tree); + + /* free entry memory */ + rtree_mem_free_entry (&entry); + + /* adjust item count */ + --tree->count; + + return TRUE; +} + +static rtree_entry * rtree_copy_entry (rtree_entry * entry, + rtree_callback_copy copy, void * extra) +{ + rtree_entry * newentry = rtree_mem_alloc_entry (&entry->range, + 0); + + if (copy) + copy (&entry->range, &entry->data, &newentry->data); + else + newentry->data = entry->data; + + return newentry; +} + +static rtree_node * rtree_copy_internal (const rtree_node * node, + int depth, int maxdepth, rtree_callback_copy copy, void * extra) +{ + int i; + rtree_node * newnode; + assert (node); + + newnode = rtree_mem_alloc_node (); + assert (newnode); + + newnode->range = node->range; + newnode->parent = 0; + + if (depth == maxdepth) + { + /* copy entries */ + for (i=0; ientry[i]) + newnode->entry[i] = rtree_copy_entry (node->entry[i], + copy, extra); + else + newnode->entry[i] = 0; + /* entries 
have no parent pointer */ + } + } + else + { + /* copy children */ + for (i=0; ichild[i]) + { + newnode->child[i] = rtree_copy_internal (node->child[i], + depth+1, maxdepth, copy, extra); + /* fix parent pointer */ + newnode->child[i]->parent = newnode; + } + else + newnode->child[i] = 0; + } + } + return newnode; +} + +rtree_handle rtree_copy (rtree_const_handle tree, rtree_callback_copy copy, + void * extra) +{ + rtree_handle newtree = rtree_mem_alloc_tree (); + *newtree = *tree; + + newtree->root = rtree_copy_internal (tree->root, 0, tree->depth, copy, extra); + assert (0 == tree->root->parent); + return newtree; +} + +/*========================================================================= + * RTree range helper functions * + *=========================================================================*/ +static inline int rtree_range_empty (const rtree_range * r) +{ + return (r->start >= r->stop); +} + +static inline RTREE_RANGE_TYPE rtree_range_size (const rtree_range * r) +{ + assert (r->start <= r->stop); + return (r->stop - r->start); +} + +/* calculate how much r1 would extend when adding r2 to it */ +static inline RTREE_RANGE_TYPE rtree_range_calc_extension (const rtree_range * r1, + const rtree_range * r2) +{ + rtree_range d; + + rtree_range_extent (r1, r2, &d); + return rtree_range_size(&d) - rtree_range_size(r1); +} + +static inline RTREE_RANGE_TYPE rtree_range_type_min + (RTREE_RANGE_TYPE a, RTREE_RANGE_TYPE b) +{ + return (a < b ? a : b); +} + +static inline RTREE_RANGE_TYPE rtree_range_type_max + (RTREE_RANGE_TYPE a, RTREE_RANGE_TYPE b) +{ + return (a > b ? 
a : b); +} + + +/* Switch pointers if needed so that range1 comes first */ +static inline void rtree_range_order (const rtree_range * * range1, + const rtree_range * * range2) +{ + if ((*range1)->start > (*range2)->start) + { + const rtree_range * tmp = *range1; + *range1 = *range2; *range2 = tmp; + } +} + +static inline int rtree_range_has_overlap (const rtree_range * range1, + const rtree_range * range2) +{ + return (rtree_range_type_max(range1->start, range2->start) + < rtree_range_type_min(range1->stop, range2->stop)); +} + +static inline int rtree_range_shared (const rtree_range * range1, + const rtree_range * range2, rtree_range * dest) +{ + dest->start = rtree_range_type_max(range1->start, range2->start); + dest->stop = rtree_range_type_min(range1->stop, range2->stop); + return (dest->start < dest->stop); +} + +static inline void rtree_range_extent (const rtree_range * range1, + const rtree_range * range2, rtree_range * dest) +{ + dest->start = rtree_range_type_min(range1->start,range2->start); + dest->stop = rtree_range_type_max(range1->stop, range2->stop); +} + +static inline void rtree_range_extend (rtree_range * range1, + const rtree_range * range2) +{ + range1->start = rtree_range_type_min(range1->start, range2->start); + range1->stop = rtree_range_type_max(range1->stop, range2->stop); +} + + +/* check if the ranges are equal */ +static inline int rtree_range_equals (const rtree_range * r1, const rtree_range * r2) +{ + assert (r1->start <= r1->stop); + if (r1->start == r1->stop) + { + return (r2->start == r2->stop); + } + + return ((r1->start == r2->start) && (r1->stop == r2->stop)); +} + +/* check if the first range contains all of the second range */ +static inline int rtree_range_contains (const rtree_range * r1, const rtree_range * r2) +{ + /* figure out later if two empty ranges contain eachother... 
*/ + assert ((r1->start != r1->stop) || (r2->start != r2->stop)); + return ((r1->start <= r2->start) && (r1->stop >= r2->stop)); +} + + +/**************************************************************************** + * Iterator functions * + ****************************************************************************/ +static inline void rtree_iterator_validate (rtree_iterator * iter) +{ + assert (iter); + assert (iter->tree); + assert (iter->createdepth == iter->tree->depth); +} + +/* fast recreate iterator after tree modification */ + +void rtree_iterator_init (rtree_handle tree, rtree_iterator * iter) +{ + iter->childnum[0] = 0; + iter->depth=0; + iter->node=tree->root; + iter->tree = tree; + rtree_iterator_validate (iter); +} + +void rtree_iterator_update (rtree_iterator * iter) +{ + assert (iter); + if (iter->createdepth != iter->tree->depth) + { + /* realloc faster? */ + free(iter->childnum); + iter->createdepth = iter->tree->depth; + iter->childnum=(int *) malloc (sizeof(int)*iter->createdepth); + } + + rtree_iterator_init (iter->tree, iter); +} + +void rtree_iterator_free (rtree_iterator_handle * iter) +{ + assert (iter); + free ((*iter)->childnum); + rtree_mem_free_iterator (iter); +} + +rtree_iterator_handle rtree_iterator_create (rtree_handle tree) +{ + rtree_iterator * ret = rtree_mem_alloc_iterator (); + ret->createdepth = tree->depth; + ret->childnum = (int*) malloc(sizeof(int)*ret->createdepth); + rtree_iterator_init (tree, ret); + rtree_iterator_forward(ret); + return ret; +} + +void rtree_iterator_forward (rtree_iterator_handle iter) +{ + /* if we're down and have entries left in the current node, + * advance child */ + if (iter->depth == iter->tree->depth) + { + assert (iter->node); + ++iter->childnum[iter->depth]; + if (iter->childnum[iter->depth] < + rtree_get_child_max(iter->tree) && + iter->node->entry[iter->childnum[iter->depth]]) + return; + } + /* go up if needed, then go down the tree */ +} + +void rtree_iterator_backward 
(rtree_iterator_handle iter) +{ + assert (FALSE); +} + + diff --git a/src/mpi/romio/adio/common/strfns.c b/src/mpi/romio/adio/common/strfns.c index 14cf81c9ad1..f782fa84e3e 100644 --- a/src/mpi/romio/adio/common/strfns.c +++ b/src/mpi/romio/adio/common/strfns.c @@ -6,6 +6,7 @@ */ #include "adio.h" +#include /* * Below are the "safe" versions of the various string and printf @@ -65,6 +66,23 @@ int ADIOI_Strncpy( char *dest, const char *src, size_t n ) debugging version */ return 1; } +/* + * Convert string to lowercase + * Converts inplace, returns pointer to str + */ +char * ADIOI_Strlower (char * str) +{ + if (!str) + return str; + while (*str) + { + *str = tolower(*str); + str++; + } + return str; +} + + /*@ ADIOI_Strdup - Duplicate a string diff --git a/src/mpi/romio/adio/common/typehelper.c b/src/mpi/romio/adio/common/typehelper.c new file mode 100644 index 00000000000..430a318d037 --- /dev/null +++ b/src/mpi/romio/adio/common/typehelper.c @@ -0,0 +1,510 @@ +#include +#include +#include "typehelper.h" +#include "adio.h" +#include "adio_extern.h" + + +/* + * Calculate every continuous segment in the access + */ +void typehelper_calcaccess (MPI_Datatype etype, + MPI_Datatype ftype, ADIO_Offset disp, ADIO_Offset offset, + int writesize, const DatatypeHandler * cb, void * userdata) +{ + int ftypecontig; + int ftype_size; + MPI_Aint ftype_extent; + int etype_size; + int ftypecount; /* number of complete ftypes in data */ + ADIOI_Flatlist_node * flat_buf; + int remainder; /* amount of bytes in last filetype */ + int i; + int j; + ADIO_Offset fileofs; + int status; + + + /* ftype_size and ftype_extent, etype_size are + * calculated when setting the view */ + MPI_Type_size (etype, &etype_size); + + offset *= etype_size; + offset += disp; + + ADIOI_Datatype_iscontig (ftype, &ftypecontig); + + if (cb->start) + cb->start (userdata); + + if (ftypecontig) + { + /* easy case */ + + if (cb->startfragment) + cb->startfragment (offset, writesize, userdata); + + status = 
cb->processdata (0, writesize, offset, userdata); + + if (cb->stopfragment) + cb->stopfragment (status, userdata); + if (cb->stop) + cb->stop (status, userdata); + return ; + } + + MPI_Type_extent (ftype, &ftype_extent); + MPI_Type_size (ftype, &ftype_size); + + /* number of complete filetypes to write */ + ftypecount = (writesize) / ftype_size; + /* number of bytes of the partial filetype */ + remainder = (writesize) % ftype_size; + + flat_buf = ADIOI_Flatten_and_find(ftype); + + assert (flat_buf); + + + /* increase offset by leading hole in first filetype */ + if (ftypecount) + offset += flat_buf->indices[0]; + + + /* we have ftype count complete filetypes */ + for (i=0; icount; ++j) + { + fileofs = offset + flat_buf->indices[j] - flat_buf->indices[0] + + (ftype_extent * i); + if (cb->startfragment) + cb->startfragment (fileofs, flat_buf->blocklens[j], userdata); + + status = cb->processdata (0, flat_buf->blocklens[j], fileofs, userdata); + + if (cb->stopfragment) + cb->stopfragment (status, userdata); + + if (!status) + break; + } + + if (!status) + break; + + } + + /* add extent for count filetypes */ + offset += ftypecount * ftype_extent; + + /* do last filetype */ + i = 0; + while (remainder) + { + ADIO_Offset increment ; + + assert (i < flat_buf->count); + /* is the final byte in this segment? */ + + increment = (remainder < flat_buf->blocklens[i] ? 
+ remainder : flat_buf->blocklens[i]); + + /* add displacement of segment, relative to the beginning of the type */ + offset += flat_buf->indices[i] - flat_buf->indices[0]; + + if (cb->startfragment) + cb->startfragment (offset, increment, userdata); + + status = cb->processdata (0, increment, offset, userdata); + + if (cb->stopfragment) + cb->stopfragment (status, userdata); + + ++i; + assert (remainder >= increment); + remainder -= increment; + if (!status) + break; + } + + if (cb->stop) + cb->stop (status, userdata); +} + +/* + * Given the parameters of a file view (the teype, ftype, displacement), an + * offset, and an amount of data to write, determine the first (start) and last + * (stop) bytes touched by the request. + * this is looking at the file type (see etype, ftype and other file view + * parameters), so take into account tiling, too. + * Does not work as expected for LB/UB modified types; + * The code will take the LB/UB as a 0 byte write and take it into account + * when determining the first and last write position + */ +void typehelper_calcrange (MPI_Datatype etype, + MPI_Datatype ftype, ADIO_Offset disp, ADIO_Offset offset, + int writesize, + ADIO_Offset * start, ADIO_Offset * stop) +{ + int ftype_size; + MPI_Aint ftype_lb, ftype_extent; + int etype_size; + int ftypecount; /* number of complete ftypes in data */ + ADIOI_Flatlist_node * flat_buf; + int remainder; /* amount of bytes in last filetype */ + int i; + ADIO_Offset last_byte; /* last byte of type, ignoring UB marker */ + + /* ftype_size and ftype_extent, etype_size are + * calculated when setting the view */ + MPI_Type_get_extent (ftype, &ftype_lb, &ftype_extent); + MPI_Type_size (ftype, &ftype_size); + MPI_Type_size (etype, &etype_size); + + /* offset is a count of etype */ + offset *= etype_size; + /* but file view displacement is absolute bytes */ + offset += disp; + + + /* instead of special-casing contiguous types, they should follow the + * noncontiguous code path */ + /* 
noncontiguous types a little trickier than contig. Take into account + * tiling of the type (count of type times extent), but also take into + * account the UB markers. Additional complication: the set_view code + * already skips over any initial lower bound marker, so 'offset' could come + * into this routine with a non-zero value */ + + /* number of complete filetypes to write */ + ftypecount = (writesize) / ftype_size; + /* number of bytes of the partial filetype */ + remainder = (writesize) % ftype_size; + + flat_buf = ADIOI_Flatten_and_find(ftype); + + assert (flat_buf); + + if (flat_buf->blocklens[flat_buf->count-1] == 0) + /* the '-2' seems odd at first glance, but if there is an upper bound + * marker then there must be at least two items in the flattened + * representation */ + last_byte = flat_buf->indices[flat_buf->count -2] + + flat_buf->blocklens[flat_buf->count -2]; + else + last_byte = flat_buf->indices[flat_buf->count -1] + + flat_buf->blocklens[flat_buf->count -1]; + + /* offset now indicates first byte to write */ + *start = offset; + + /* before tiling these filetypes, need to (maybe) wind back the offset to + * account for the lower bound (which set_file_view skipped over) */ + if (flat_buf->blocklens[0] == 0) + offset -= flat_buf->indices[1]; + + /* add extent for count filetypes. these are the complete file types. 
+ * We'll do any partial file types below */ + offset += ftypecount * ftype_extent; + + if (remainder == 0) { + /* no partial types to worry about, but need to trim off the upper + * bound (if exists), since no type will tile after this one */ + offset -= (ftype_extent - ftype_lb) - last_byte; + } else { + for (i=0; remainder > 0; i++) { + remainder -= flat_buf->blocklens[i]; + offset += flat_buf->indices[i] + flat_buf->blocklens[i]; + } + } + *stop = offset; +} + +static int typehelper_decodememtype_contiguous + (void * buf, int count, MPI_Datatype memtype, + const DatatypeHandler * callback, void * data) +{ + int size; + int status; + MPI_Type_size (memtype, &size); + + /* the easy case */ + if (callback->start) + callback->start (data); + + if (callback->startfragment) + callback->startfragment (0, size, data); + + status = callback->processdata (buf, size*count, + 0, data); + + if (callback->stopfragment) + callback->stopfragment (status, data); + + if (callback->stop) + callback->stop (status, data); + + return status ; +} + +/* stream the datatype contents (meant for memory datatypes) */ +int typehelper_decodememtype (const void * buf, int count, + MPI_Datatype memtype, + const DatatypeHandler * callback, void * data) +{ + ADIOI_Flatlist_node * flat_buf; + int continuous; + int status; + int i; + int j; + MPI_Aint extent; + char * ptr; + MPI_Offset bytecount ; + + /* check for easy case */ + ADIOI_Datatype_iscontig (memtype, &continuous); + + if (continuous) + return typehelper_decodememtype_contiguous ((char *)buf, count, + memtype, callback, data); + + /* for now use the flatlist; later on use the dataloop code */ + MPI_Type_extent (memtype, &extent); + + /* find flattened version */ + flat_buf = ADIOI_Flatten_and_find(memtype); + + if (callback->start) + callback->start (data); + + ptr = (char *) buf; + bytecount = 0; + for (i=0; icount; ++j) + { + ptr = ((char*)buf) + flat_buf->indices[j] + + i * extent; + if (callback->startfragment) + 
callback->startfragment (bytecount, + flat_buf->blocklens[j], data); + + status = callback->processdata (ptr, flat_buf->blocklens[j], + bytecount, data); + + bytecount += flat_buf->blocklens[j]; + + if (callback->stopfragment) + callback->stopfragment (status, data); + } + } + + if (callback->stop) + callback->stop (status, data); + + return status; +} + + +static inline MPI_Offset min_offset (MPI_Offset a, MPI_Offset b) +{ + return (a < b ? a : b); +} + +/* offset is in bytes, displacement is in bytes */ +int typehelper_processtypes (MPI_Datatype memtype, void * buf, int count, + MPI_Datatype filetype, MPI_Datatype etype, int offset, int displacement, + const DatatypeHandler * callback, void * data) +{ + int memtypecontig; + int filetypecontig; + int transfersize; + int status = 1; /* return code */ + + /* check for null operation */ + MPI_Type_size (memtype, &transfersize); + transfersize *= count; + if (0 == transfersize) + return 0; + + + ADIOI_Datatype_iscontig (memtype, &memtypecontig); + ADIOI_Datatype_iscontig (filetype, &filetypecontig); + + if (callback->start) + callback->start (data); + + if (memtypecontig && filetypecontig) + { + int size; + + /* calculate byte size of the operation */ + MPI_Type_size (memtype, &size); + size *= count; + + if (callback->startfragment) + callback->startfragment (offset + displacement, size, data); + + status = callback->processdata (buf, size, offset + displacement, data); + + if (callback->stopfragment) + callback->stopfragment (status, data); + } + else + { + MPI_Aint filetype_extent; + int filetype_size; + int etype_size; + char * dataptr; + int i; + MPI_Offset writestart = offset + displacement; + int todo = transfersize; /* amount of data in bytes */ + + ADIOI_Flatlist_node *flat_file; + + /* no noncontig memtypes for now */ + assert (memtypecontig); + + /* contiguous in mem, noncont in file */ + /* flatten filetype */ + + MPI_Type_extent (filetype, &filetype_extent); + MPI_Type_size (filetype, &filetype_size); + 
MPI_Type_size (etype, &etype_size); + + flat_file = ADIOI_Flatten_and_find(filetype); + assert (flat_file); + + /* now we have flat_file->blocklens and flat_file->indices + * (flat_file->count) + * blocklens is in bytes + * indices is offset from start of the type in bytes + * NOTE: start and interaction of type LB ? + * */ + + dataptr = buf; + todo = transfersize; + while (todo) + { + for (i=0; todo && icount; ++i) + { + /* size of cont part in bytes */ + int itemsize = + min_offset(flat_file->blocklens[i], todo); + + /* absolute start of write region in file in bytes */ + int fileofs = writestart + flat_file->indices[i]; + + + if (callback->startfragment) + callback->startfragment (fileofs, itemsize, data); + + status = callback->processdata (dataptr, itemsize, fileofs, data); + + if (callback->stopfragment) + callback->stopfragment (status, data); + + /* advance memory pointer */ + dataptr += itemsize; + todo -= itemsize; + + if (!status) + break; + } + + if (!status) + break; + + writestart += filetype_extent; + } + } + + if (callback->stop) + callback->stop (status, data); + + return status; +} + + +/****************************************** debug code **********************/ +static int dumpfunc (void * membuf, int size, ADIO_Offset fileoffset, void * data) +{ + fprintf (stderr, "typehelper_processtype_debug: mem %p size %u going to file @ %lld\n", + membuf, size, fileoffset); + return 1; +} + +static void startfunc (void * data) +{ + fprintf (stderr, "start of processing\n"); +} + +static void stopfunc (int status, void * data) +{ + fprintf (stderr, "stop of processing\n"); +} + +static void startfragment (ADIO_Offset ofs, ADIO_Offset size, void * data) +{ + fprintf (stderr, "Start fragment of size %llu at %llu\n", size, ofs); +} + +static void stopfragment (int status, void * data) +{ + fprintf (stderr, "stop fragment\n"); +} +void typehelper_processtypes_debug (MPI_Datatype memtype, void * buf, int count, + MPI_Datatype filetype, MPI_Datatype etype, int 
offset,int displacement) +{ + DatatypeHandler handler; + handler.processdata = dumpfunc; + handler.start = startfunc; + handler.stop = stopfunc; + handler.startfragment = startfragment; + handler.stopfragment = stopfragment; + typehelper_processtypes (memtype, buf, count, filetype, etype, + offset, displacement, &handler, 0); +} +/****************************************************************************/ + +void typehelper_processoperation (MPI_Datatype memtype, void * buf, int count, + ADIO_File fd, MPI_Offset offset, int file_ptr_type, + const DatatypeHandler * callback, void * data, int debug) +{ + int memcontig; + int filecontig; + MPI_Count transfersize; + + /* performance check */ + MPI_Type_size_x (memtype, &transfersize); + if (0 == transfersize * count) + return; + + ADIOI_Datatype_iscontig (memtype, &memcontig); + ADIOI_Datatype_iscontig (fd->filetype, &filecontig); + + /* calculate offset, in bytes */ + switch (file_ptr_type) + { + case ADIO_EXPLICIT_OFFSET: + assert (offset >= 0); + offset *= fd->etype_size; + break; + case ADIO_INDIVIDUAL: + offset = fd->fp_ind; + /* fh->fp_ind is already in bytes */ + break; + default: + /* shared filepointer is handled in higher level */ + assert (0 /* UNHANDLED CASE */); + break; + }; + + if (debug) + typehelper_processtypes_debug (memtype, buf, count, + fd->filetype, fd->etype, offset, fd->disp); + else + typehelper_processtypes (memtype, buf, count, + fd->filetype, fd->etype, offset, fd->disp, + callback, data); +} diff --git a/src/mpi/romio/adio/common/writering.c b/src/mpi/romio/adio/common/writering.c new file mode 100644 index 00000000000..995accb5859 --- /dev/null +++ b/src/mpi/romio/adio/common/writering.c @@ -0,0 +1,1167 @@ +#include +#include +#include +#include + +#ifdef HAVE_EFENCE +#include +#endif + +#include "writering.h" + + +#define LOCK_FREE 0 +#define LOCK_WRITE_ACTIVE 1 +#define LOCK_READ_ACTIVE 2 + +/** + * Helper for writecombining and readahead + * + */ + +typedef struct +{ + char * data; 
/* data pointer */ + WRR_OFFSET startofs; /* Start offset in file */ + unsigned int dirty; /* Block needs to be written */ + unsigned int locked; /* write or read is in progress */ + unsigned int used; /* How much data is in the block */ +} writering_block; + +struct writering_instance +{ + writering_ops ops; /* external helper operations */ + void * ops_data; /* extra data for external helper */ + + unsigned int maxblockcount; /* max block count */ + unsigned int blocksize; /* size in bytes of a block */ + unsigned int blockcount; /* number of blocks in use */ + + writering_block * blocks; /* Pointer to array of pointers to blocks */ + + int write_active; /* if a write is in progress */ + int read_active; /* if a read is in progress */ + unsigned int write_size; /* Size of non-blocking write */ + unsigned int read_size; /* Size of active non-blocking read request */ + unsigned int write_ready; /* Number of full dirty blocks in queue */ + + int readmode; /* If reading is allowed */ + int writemode; /* If writing is allowed */ + + int sync; /* to turn off all buffering */ + + int open; /* for lazy opening */ + + unsigned int lastusedblock; /* last used block (reading/writing) */ + WRR_OFFSET filesize; /* Size of file (logical) */ + WRR_OFFSET lastread; /* Offset of last read operation */ + unsigned int readops; /* Number of consecutive ordered read ops */ + + int debug; /* do extra checking */ +}; + + +typedef struct writering_instance writering_instance; + + +/* =========================================================================== */ + +/* forwards */ +void writering_progress (writering_handle handle); + + +/* ======================================================================= */ + +#define WRR_MIN(a,b) ( (a) < (b) ? (a) : (b) ) +#define WRR_MAX(a,b) ( (a) > (b) ? 
(a) : (b) ) + +static inline void writering_assert (int t) +{ + if (t) + return; + fprintf (stderr, "Writering: assertion failed!\n"); + *(char*) 0 = 10; +} + +/* ======================================================================= */ + +/* + * Make sure the helper is active so that we can read/write + */ +inline static void writering_ensureopen (writering_handle handle) +{ + WRR_OFFSET realfilesize; + if (handle->open) + return; + + handle->ops.init (handle->ops_data, handle->readmode, handle->writemode); + handle->open = 1; + + handle->ops.getsize (handle->ops_data, &realfilesize); + handle->filesize = WRR_MAX (handle->filesize, realfilesize); +} + + +/* ======================================================================= */ +/* ======================================================================= */ +/* ======================================================================= */ + +static void writering_write_immediate (writering_handle handle, + WRR_OFFSET ofs, const void * data, unsigned int datasize) +{ + unsigned int written; + + /* cannot do this if a write is already active */ + assert (handle->write_active < 0); + + /*fprintf (stderr, "immediate write @ %lu, %lu bytes\n", + (long unsigned) ofs, (long unsigned) datasize); */ + writering_ensureopen (handle); + handle->ops.start_write (handle->ops_data, ofs, + data, datasize); + handle->ops.wait_write (handle->ops_data, &written); + + writering_assert (written == datasize); +} + +/* ======================================================================= */ +/* ======================================================================= */ +/* ======================================================================= */ + +/* + * Read non-blocking + */ +static int writering_read_immediate (writering_handle handle, + WRR_OFFSET ofs, void * data, unsigned int size) +{ + unsigned int read; + + /* can't have any nonblocking read going on */ + assert (handle->read_active < 0); + + + writering_ensureopen (handle); + 
handle->ops.start_read (handle->ops_data, ofs, data, size); + handle->ops.wait_read (handle->ops_data, &read); + return read; +} + + +static int writering_validate_overlap (const writering_block * b1, + const writering_block * b2) +{ + /* order first block in b1 */ + if (b1->startofs > b2->startofs) + { + const writering_block * tmp = b1; + b1 = b2; + b2 = tmp; + } + + /* b1 is first block */ + return (b2->startofs < (b1->startofs + b1->used)); +} + +/* validate internal data structure */ +static void writering_validate (writering_consthandle handle) +{ + unsigned int i,j ; + assert (handle->blockcount <= handle->maxblockcount); + + for (i=0; iblockcount; ++i) + { + const writering_block * ptr = &handle->blocks[i]; + + if (!ptr->used) + continue; + + assert (ptr->data); + assert (ptr->used <= handle->blocksize); + + assert (ptr->startofs + ptr->used <= handle->filesize); + + /* check non-overlapping */ + for (j=i+1; jblockcount; ++j) + { + const writering_block * ptr2 = &handle->blocks[j]; + assert (!ptr2->used || + !writering_validate_overlap (ptr, ptr2)); + } + } +} + +/* ======================================================================= */ +/* == Non-blocking reads/writes ========================================== */ +/* ======================================================================= */ + +/* Find a block suitable for non-blocking write */ +/* Return block number or -1 if none found */ +static int writering_write_nonblock_select + (writering_consthandle handle) +{ + int minblock = -1; + int i; + WRR_OFFSET minofs; + + /* go over the block list, and only consider full dirty blocks; + * Select the earliest in the fule among all full blocks, and return that one. 
*/ + + if (!handle->write_ready) + return -1; + + if (handle->write_active >= 0) + return -1; + + for (i=0; iblockcount; ++i) + { + if (!handle->blocks[i].used || !handle->blocks[i].dirty || + handle->blocks[i].locked != LOCK_FREE) + { + continue; + } + + if (minblock < 0 || (handle->blocks[i].startofs < minofs)) + { + minblock = i; + minofs = handle->blocks[i].startofs; + } + } + assert (minblock == -1 || (minblock >= 0 && minblock < handle->blockcount)); + return minblock; +} + + +/* return true if a non-blocking write is going on */ +static inline int writering_write_nonblock_active + (writering_consthandle handle) +{ + return handle->write_active >= 0; +} + +/* return true if a non-blocking read is going on */ +static inline int writering_read_nonblock_active + (writering_consthandle handle) +{ + return handle->read_active; +} + +/* Try to start nonblocking write */ +static void writering_write_nonblock_start (writering_handle handle) +{ + int block = writering_write_nonblock_select (handle); + writering_block * blockptr; + + if (block < 0) + return; + + blockptr = &handle->blocks[block]; + + assert (handle->write_ready); + --handle->write_ready; + + assert (blockptr->locked == LOCK_FREE); + blockptr->locked = LOCK_WRITE_ACTIVE; + + writering_ensureopen (handle); + handle->ops.start_write (handle->ops_data, blockptr->startofs, + blockptr->data, blockptr->used); + + handle->write_active = block; + handle->write_size = blockptr->used; +} + +static int writering_read_nonblock_test (writering_handle handle) +{ + assert (handle->read_active >= 0); + assert (0); + return 0; +} + +static void writering_read_nonblock_finished (writering_handle handle, + unsigned int size) +{ + assert (handle->read_active >= 0); + assert (0); +} + +static void writering_write_nonblock_finished (writering_handle handle) +{ + writering_block * block; + assert (handle->write_active >= 0); + + block = &handle->blocks[handle->write_active]; + + block->dirty = 0; + block->locked = LOCK_FREE; 
+ handle->write_active = -1; +} + +static int writering_write_nonblock_test (writering_handle handle) +{ + unsigned int written; + assert (handle->write_active >= 0); + + if (handle->ops.test_write (handle->ops_data, &written)) + { + // If this fails, there was an error writing the data + assert (written == handle->write_size); + writering_write_nonblock_finished (handle); + return 1; + } + else + { + return 0; + } +} + +static void writering_read_nonblock_wait (writering_handle handle) +{ + assert (handle->read_active >= 0); + unsigned int read; + handle->ops.wait_read (handle->ops_data, &read); + writering_read_nonblock_finished (handle, read); +} + +static void writering_write_nonblock_wait (writering_handle handle) +{ + assert (handle->write_active >= 0); + unsigned int written; + handle->ops.wait_write (handle->ops_data, &written); + assert (written == handle->write_size); + writering_write_nonblock_finished (handle); + +} + + +/* ======================================================================= */ +/* ======================================================================= */ +/* ======================================================================= */ + + +/* + * Constructor + */ +writering_handle writering_create (int blocksize, int maxblockcount, + const writering_ops * ops, void * opsdata, int read, int write) +{ + writering_instance * data = + (writering_instance *) malloc (sizeof(writering_instance)); + + data->ops = *ops; + data->ops_data = opsdata; + + data->maxblockcount = maxblockcount; + data->blocksize = blocksize; + + data->blockcount = 0; + + data->blocks = (writering_block*) malloc (sizeof(writering_block)*data->maxblockcount); + + data->open = 0; + data->sync = 0; + + /* reading/writing is allowed if we have 1 or more blocks allocated */ + data->writemode = write; + data->readmode = read; + + data->write_active = -1; + data->read_active = -1; + data->write_ready = 0; + + data->filesize = 0; + data->lastread = 0; + data->readops = 0; + + 
data->lastusedblock = 0; + data->debug = 0; + + return data; +} + +/* ==========================================================================*/ +/* ==========================================================================*/ +/* ==========================================================================*/ + +void writering_setdebug (writering_handle handle, int debug) +{ + handle->debug = debug; +} + +void writering_setsync (writering_handle handle, int sync) +{ + /* If going to sync mode, flush writes/reads first */ + if (sync) + { + if (handle->writemode) + writering_write_flush (handle); + if (handle->readmode) + writering_read_flush (handle); + /* no point in keeping unused memory */ + writering_reducemem (handle); + } + handle->sync = sync; +} + +/* ==========================================================================*/ +/* ==========================================================================*/ +/* ==========================================================================*/ + +void writering_free (writering_handle * handle) +{ + assert (handle); + assert (*handle); + + /* There shouldn't be any more read/write operations, + * so reducemem should be able to free all memory */ + writering_write_flush (*handle); + writering_reducemem (*handle); + assert (0 == (*handle)->blockcount); + free ((*handle)->blocks); + + /* Close the helper */ + if ((*handle)->open) + { + (*handle)->open = 0; + (*handle)->ops.done ((*handle)->ops_data); + } + + free (*handle); + + *handle = 0; +} + +/* ==========================================================================*/ +/* ==========================================================================*/ +/* ==========================================================================*/ + +/* Invalidate all cached reads */ +void writering_read_flush (writering_handle handle) +{ + int i; + assert (handle); + + /* Flush doesn't affect the logical write position, + * only the real fileposition */ + + /* if there is nothing in 
memory we cannot flush anything */ + if (!handle->blockcount) + return; + + /* if there are no reads pending, skip waiting */ + if (handle->read_active < 0) + return; + + /* Wait until pending read is done */ + writering_read_nonblock_wait (handle); + + /* Clear all read buffers */ + for (i=0; iblockcount; ++i) + { + /* Clear all non-dirty blocks */ + if (handle->blocks[i].used && !handle->blocks[i].dirty) + handle->blocks[i].used = 0; + } +} + +/*=========================================================================*/ +/*=========================================================================*/ +/*=========================================================================*/ + +/* Flush all buffered writes; Data becomes available as read buffer */ +void writering_write_flush (writering_handle handle) +{ + int i; + + if (!handle->blockcount) + return; + + /* Wait for pending write to complete */ + if (handle->write_active >= 0) + writering_write_nonblock_wait (handle); + + for (i=0; iblockcount; ++i) + { + if (handle->blocks[i].used && handle->blocks[i].dirty) + { + /* Do nonblocking write */ + writering_write_immediate (handle, handle->blocks[i].startofs, + handle->blocks[i].data, handle->blocks[i].used); + /* Clear dirty flag */ + handle->blocks[i].dirty = 0; + } + } +} + +/*=========================================================================*/ +/*=========================================================================*/ +/*=========================================================================*/ + +void writering_getsize (writering_handle handle, WRR_OFFSET * ofs) +{ + /* get filesize */ + writering_ensureopen (handle); + *ofs = handle->filesize; +} + +/*=========================================================================*/ +/*=========================================================================*/ +/*=========================================================================*/ + +void writering_progress (writering_handle handle) +{ + /* Test for 
completion of a pending read */ + writering_read_nonblock_test (handle); + + /* test for completion of pending write; + * If no more active writes, try to flush the first full dirty block */ + writering_write_nonblock_test (handle); + if (!handle->write_active) + writering_write_nonblock_start (handle); +} + +/*=========================================================================*/ +/*=========================================================================*/ +/*=========================================================================*/ + +static inline int writering_blockcontains (writering_handle handle, + unsigned int blocknum, WRR_OFFSET ofs) +{ + return (handle->blocks[blocknum].startofs <= ofs && + (handle->blocks[blocknum].startofs+handle->blocksize) > ofs); +} + +/* find block that contains this offset; -1 if not found; + Blocks that have used == 0 are considered unallocated blocks */ +static inline int writering_findblock (writering_handle handle, WRR_OFFSET ofs) +{ + int i; + + if (handle->lastusedblock >= 0 + && handle->lastusedblock < handle->blockcount) + { + /* check last block */ + if (writering_blockcontains (handle, handle->lastusedblock, ofs) + && handle->blocks[handle->lastusedblock].used) + return handle->lastusedblock; + } + + /* not cached, search blocks */ + for (i=0; iblockcount; ++i) + { + if (handle->blocks[i].used && writering_blockcontains (handle, i, ofs)) + { + handle->lastusedblock = i; + return i; + } + } + return -1; +} + + +/* First select try to select a block before lastread (preferably a non-dirty + * non-locked one); + * If this fails, select the last block in file order */ +unsigned int writering_reclaimblock_readmode (writering_handle handle) +{ + unsigned int i; + int ret = -1; + unsigned int maxofs = 0; + WRR_OFFSET maxstartofs = handle->blocks[0].startofs; + int dirtyret = -1; + WRR_OFFSET startofs; + WRR_OFFSET dirtystartofs; + /* Try to reclaim a block before readofs */ + for (i=0; iblockcount; ++i) + { + 
writering_block * ptr = &handle->blocks[i]; + + /* unused blocks should have been selected earlier on */ + assert (ptr->used); + + /* find last block in file order */ + if (ptr->startofs > maxstartofs) + { + maxofs = i; + maxstartofs = ptr->startofs; + } + + /* Find first non-dirty block in file order before lastread */ + if ((ptr->startofs + ptr->used) < handle->lastread) + { + if (ret < 0 || ptr->startofs < startofs) + { + ret = i; + startofs = ptr->startofs; + } + if (dirtyret < 0 || ptr->startofs < dirtystartofs) + { + dirtyret = i; + dirtystartofs = ptr->startofs; + } + } + + } + + /* try non-dirty/locked block before lastread */ + if (ret >= 0) + return ret; + + /* try dirty/locked block before lastread */ + if (dirtyret >= 0) + return dirtyret; + + /* choose block with highest startofs */ + return maxofs; +} + + +/* + * In writemode we are only worried about not blocking the writer; + * For now, just reuse any non-dirty buffer (preferring the smallest one); + * If there are none, write the buffer with the most data. + */ +unsigned int writering_reclaimblock_writemode (writering_handle handle) +{ + int clean = -1; + int full = -1; + unsigned int fullsize; + unsigned int cleansize; + unsigned int i; + + for (i=0; iblockcount; ++i) + { + writering_block * ptr = &handle->blocks[i]; + + /* Empty block :-) */ + if (!ptr->used) + return i; + + if (!ptr->dirty) + { + if (clean < 0 || ptr->used < cleansize) + { + clean = i; + cleansize = ptr->used; + } + } + else + { + if (full < 0 || ptr->used > fullsize) + { + full = i; + fullsize = ptr->used; + } + } + } + if (clean >= 0) + return clean; + assert (full >= 0); + return full; +} + + +/* + * Return a free block; + * Either reclaim a used block, or (preferably) create a new one. + * + * We use readops as an indication of what phase we're in; + * If readops > 0 we're most likely sequentially reading through the file. 
+ */ +static unsigned int writering_reclaimblock (writering_handle handle) +{ + int ret = -1; + writering_block * ptr; + int i; + + /* Try to add a new block */ + if (handle->blockcount < handle->maxblockcount) + { + unsigned int ret = handle->blockcount++; + writering_block * ptr = &handle->blocks[ret]; + ptr->used = 0; + ptr->data = (char*) malloc (handle->blocksize); + ptr->locked = LOCK_FREE; + ptr->dirty = 0; + ptr->startofs = (WRR_OFFSET) -1; + assert (ptr->data); + return ret; + } + + /* Try to find an unused block */ + for (i=0; iblockcount; ++i) + { + if (!handle->blocks[i].used) + { + ret = i; + break; + } + } + + if (ret < 0) + { + if (handle->readops) + ret= writering_reclaimblock_readmode (handle); + else + ret= writering_reclaimblock_writemode (handle); + } + + /* we have decided on a block, clean it if needed and return */ + + ptr = &handle->blocks[ret]; + + /* If it was used, we might need to clean up */ + if (ptr->used) + { + switch (ptr->locked) + { + case LOCK_WRITE_ACTIVE: + writering_write_nonblock_wait (handle); + break; + case LOCK_READ_ACTIVE: + writering_read_nonblock_wait (handle); + break; + case LOCK_FREE: + break; + default: + assert (0); + } + + assert (ptr->locked == LOCK_FREE); + + if (ptr->dirty) + { + if (handle->write_active >= 0) + writering_write_nonblock_wait (handle); + assert (handle->write_active < 0); + writering_write_immediate (handle, + ptr->startofs, ptr->data, ptr->used); + ptr->dirty = 0; + } + + ptr->used=0; + } + + return ret; +} + +/* + * Remove all blocks that could contain data from this range (even if they + * don't have any data in that range now); + */ +static void writering_clear (writering_handle handle, WRR_OFFSET ofs, + unsigned int size) +{ + int i; + WRR_OFFSET range_start, range_stop; + + range_start = ofs; range_stop = range_start + size; + + for (i=0; iblockcount; ++i) + { + writering_block * ptr; + if (!handle->blocks[i].used) + continue; + + WRR_OFFSET block_start = handle->blocks[i].startofs; + 
WRR_OFFSET block_stop = block_start + handle->blocksize; + + if (block_stop <= range_start || block_start >= range_stop) + continue; + + /* block is in range */ + assert ( (block_start >= range_start && block_stop <= range_stop) + || (block_start < range_start && block_stop <= range_stop) + || (block_start > range_start && block_stop >= range_stop)); + + ptr = &handle->blocks[i]; + if (ptr->locked) + { + if (ptr->locked == LOCK_WRITE_ACTIVE) + { + writering_write_nonblock_wait (handle); + } + else + { + assert (ptr->locked == LOCK_READ_ACTIVE); + writering_read_nonblock_wait (handle); + } + assert (ptr->locked == LOCK_FREE); + } + + /* if the block is dirty, write it to disk */ + if (ptr->dirty) + { + if (handle->write_active >= 0) + writering_write_nonblock_wait (handle); + assert (handle->write_active < 0); + writering_write_immediate (handle, ptr->startofs, + ptr->data, ptr->used); + } + + /* mark block as unused */ + ptr->used = 0; + ptr->locked = 0; + ptr->startofs = (WRR_OFFSET) -1; + + } + +#ifndef NDEBUG + if (handle->debug) + writering_validate (handle); +#endif +} + +void writering_write (writering_handle handle, WRR_OFFSET ofs, + const void * data, unsigned int size) +{ + int block; + writering_block * blockptr; + unsigned int todo = size; + WRR_OFFSET curofs = ofs; + + handle->readops = 0; + handle->lastread = 0; + + if (!size) + return; + + if (handle->sync) + { + /* track filesize */ + if (ofs + size > handle->filesize) + handle->filesize = ofs + size; + writering_write_immediate (handle, ofs, data, size); + return; + } + + /* nonblocking mode */ + + while (todo) + { + unsigned int thiswrite; + /* See if we have a memory block for this write */ + block = writering_findblock (handle, curofs); + + if (block>= 0) + { + blockptr = &handle->blocks[block]; + + /* check that we're not creating a hole in the block */ + if ((blockptr->startofs + blockptr->used) < curofs) + { + unsigned int readsize; + unsigned int read; + + /* wait until background read 
finishes */ + if (handle->read_active >= 0) + writering_read_nonblock_wait (handle); + assert (handle->read_active < 0); + + /* try to read from ptr->used until end of block */ + /* NOTE: we could be trying to read past the end of the file, + * and we can only know for sure if the file is already open; + * However, we do know that no block has an overlapping range so + * we don't have to consider reading from in-memory blocks */ + + /* ensureopen also updates the file size + * (in case we're writing to an existing file that extends beyond + * the furthest byte written to the writering) */ + writering_ensureopen (handle); + + assert (blockptr->startofs < handle->filesize); + + readsize = WRR_MIN(handle->blocksize - blockptr->used, + handle->filesize - blockptr->startofs - blockptr->used); + + /* when the buffered (logical) filesize is larger than the flushed + * filesize we could be reading past the end of the file here */ + read = + writering_read_immediate (handle, blockptr->startofs + blockptr->used, + (char*) blockptr->data + blockptr->used, readsize); + + if (read < readsize) + { + fprintf (stderr, "writering: warning: reading uninitialised data\n"); + memset ((char*) blockptr->data + blockptr->used + read, + 0, readsize - read); + } + + blockptr->used += readsize; + } + } + else + { + /* we need to come up with a new block */ + block = writering_reclaimblock (handle); + blockptr = &handle->blocks[block]; + blockptr->used = 0; + blockptr->startofs = curofs; + + /* Need to clear blocks in our range to avoid overlapping blocks */ + writering_clear (handle, blockptr->startofs, handle->blocksize); + } + + /* add data to the block */ + thiswrite = WRR_MIN(handle->blocksize - (curofs - blockptr->startofs), + todo); + + memcpy ((char *) (blockptr->data) + (curofs - blockptr->startofs), + data, thiswrite); + + /* update dirty & used */ + blockptr->dirty = 1; + blockptr->used = WRR_MAX(blockptr->used, curofs - blockptr->startofs + thiswrite); + data = (char*) data + 
thiswrite; + todo -= thiswrite; + curofs += thiswrite; + + /* track filesize */ + if (curofs > handle->filesize) + handle->filesize = curofs; + } + +#ifndef NDEBUG + if (handle->debug) + writering_validate (handle); +#endif +} + + +int writering_read (writering_handle handle, WRR_OFFSET ofs, + void * data, unsigned int size) +{ + const unsigned int requested = size; + + if (ofs > handle->lastread || !handle->readops) + ++handle->readops; + handle->lastread = ofs; + + if (!size) + return 0; + + if (handle->sync) + return writering_read_immediate (handle, ofs, data, size); + + + /* check if we have the block; if not; wait for any background reads and do + * an immediate read; + * */ + + + while (size) + { + if (ofs >= handle->filesize) + /* reading beyond end of file */ + break; + + int blocknum = writering_findblock (handle, ofs); + writering_block * ptr; + unsigned int thisread; + + if (blocknum < 0) + { + /* We don't have the block */ + /* Try to reclaim a block and read the whole block from the file */ + blocknum = writering_reclaimblock (handle); + unsigned int read; + + writering_ensureopen (handle); + + /* find blocks still overlapping with our range and free them */ + ptr = &handle->blocks[blocknum]; + ptr->startofs = ofs; + ptr->used = 0; + writering_clear (handle, ptr->startofs, handle->blocksize); + + + /* read data into the block */ + ptr->used = WRR_MIN (handle->blocksize, handle->filesize - ofs); + read = writering_read_immediate (handle, ptr->startofs, + ptr->data, ptr->used); + + if (read < ptr->used) + { + /* fill with zeros; + * This is needed when the logical filesize is larger than ofs + * (because we did a write past ofs) but the real file hasn't been + * updated yet. 
Reading from the file will result in less than + * ptr->used bytes in this case */ + memset ((char*) ptr->data + read, 0, ptr->used - read); + } + } + + /* read data from blocknum */ + ptr = &handle->blocks[blocknum]; + + /* writering_findblock return the block that *could* hold the data; + * It doesn't mean it has the data; So we check for the case where + * the data we want to read isn't in the block */ + if (ptr->startofs + ptr->used <= ofs) + { + unsigned int read; + + /* We're not reading beyond the filesize, since we checked for that; + * We read the rest from the block (up to EOF) from the file + * (since there could be old data in there if the file existed before + * we started writing); If the read failed we fill with zero's */ + unsigned int readsize; + + /* ensureopen could modify handle->filesize */ + writering_ensureopen (handle); + + readsize = WRR_MIN( + handle->filesize - ptr->startofs - ptr->used, + handle->blocksize - ptr->used); + + if (handle->read_active >= 0) + writering_read_nonblock_wait (handle); + + read = writering_read_immediate (handle, ptr->startofs + ptr->used, + (char*) ptr->data + ptr->used, readsize); + assert (read >= 0); + if (read != readsize) + { + assert (read < readsize); + fprintf (stderr, "warning: writering: reading uninitialized data!\n"); + memset ((char*) ptr->data + ptr->used + read, 0, + readsize - read); + } + ptr->used += readsize; + thisread = WRR_MIN(size, ptr->used - (ofs - ptr->startofs)); + } + else + { + assert (ptr->startofs <= ofs); + thisread = WRR_MIN (size, ptr->used - (ofs - ptr->startofs)); + } + + memcpy (data, (char*) ptr->data + (ofs - ptr->startofs), thisread); + ofs += thisread; + size -= thisread; + data = (char*) data + thisread; + } + +#ifndef NDEBUG + if (handle->debug) + writering_validate (handle); +#endif + + return requested - size; +} + +/*=========================================================================*/ 
+/*=========================================================================*/ +/*=========================================================================*/ + +/* Release the buffers of all blocks that hold no dirty data and compact the + * block table so that the blocks that still own a buffer come first; + * blockcount is reduced to the number of remaining blocks. */ +void writering_reducemem (writering_handle handle) +{ + unsigned int i, front, back; + unsigned int check = 0; + + if (!handle->blockcount) + return; + + /* read_active / write_active are block indices: finish outstanding + * nonblocking I/O first so no buffer is freed and no block is moved + * while an operation still refers to it. + * NOTE(review): assumes the wait functions leave *_active < 0, as the + * other callers in this file assert. */ + if (handle->read_active >= 0) + writering_read_nonblock_wait (handle); + if (handle->write_active >= 0) + writering_write_nonblock_wait (handle); + + /* pass 1: free all memory that can be released */ + for (i=0; i<handle->blockcount; ++i) + { + if (handle->blocks[i].used && handle->blocks[i].dirty) + { + ++check; + continue; + } + + handle->blocks[i].used = 0; + handle->blocks[i].dirty = 0; + free (handle->blocks[i].data); + handle->blocks[i].data = 0; + } + + /* Pass 2: compress list */ + front = 0; back = handle->blockcount-1; + + while (1) + { + if (front == handle->blockcount) + { + /* Nothing could be released; return */ + return; + } + + if (!handle->blocks[front].data) + { + /* Current block is empty */ + /* Find nonempty block at the back */ + while (!handle->blocks[back].data && back > front) + --back; + if (back == front) + { + /* no more full blocks...*/ + break; + } + /* Switch full and empty block */ + handle->blocks[front] = handle->blocks[back]; + handle->blocks[back].data = 0; + } + + ++front; + }; + + /* double check, since I was half asleep when writing this*/ +/* assert (handle->blockcount == front); */ + for (i=0; i<front; ++i) + assert (handle->blocks[i].data); + for (i=front; i<handle->blockcount; ++i) + assert (!handle->blocks[i].data); + + /* Front points to the first empty block */ + handle->blockcount = front; +} + + +void writering_flush (writering_handle handle) +{ + writering_write_flush (handle); + writering_read_flush (handle); +} + + +void writering_reset (writering_handle handle, WRR_OFFSET size) +{ + unsigned int i; + /* file might not yet be open, but that's strange... */ + if (!
handle->open) return; + + for (i=0; i<handle->blockcount; ++i) + { + writering_block * ptr = &handle->blocks[i]; + if (!ptr->used) + continue; + + if (ptr->startofs + ptr->used < size) + continue; + + /* block is completely in the discarded part; + * discard block */ + if (ptr->startofs >= size) + { + ptr->startofs = (WRR_OFFSET) -1; + ptr->used = 0; + continue; + } + + /* block has to be discarded partially; check for pending writes */ + /* if block is active wait */ + if (handle->read_active == i) + writering_read_nonblock_wait (handle); + if (handle->write_active == i) + writering_write_nonblock_wait (handle); + + /* keep only the bytes in front of the new end of file, i.e. the range + * [startofs, size); everything from size onwards is dropped */ + assert (ptr->startofs < size); + ptr->used = size - ptr->startofs; + } + + /* all blocks after size are non-active and discarded/truncated + * set filesize */ + handle->ops.reset (handle->ops_data, size); + handle->filesize = size; +} diff --git a/src/mpi/romio/adio/include/ad_tuning.h b/src/mpi/romio/adio/include/ad_tuning.h index f481cccda62..994f9868533 100644 --- a/src/mpi/romio/adio/include/ad_tuning.h +++ b/src/mpi/romio/adio/include/ad_tuning.h @@ -29,6 +29,7 @@ extern int romio_onesided_no_rmw; extern int romio_onesided_always_rmw; extern int romio_onesided_inform_rmw; extern int romio_tunegather; +extern int romio_agg_map_policy; /* set internal variables for tuning environment variables */ void ad_get_env_vars(void); diff --git a/src/mpi/romio/adio/include/adio.h b/src/mpi/romio/adio/include/adio.h index d29a01f76bc..20f5e336d3f 100644 --- a/src/mpi/romio/adio/include/adio.h +++ b/src/mpi/romio/adio/include/adio.h @@ -62,6 +62,8 @@ #define ROMIOCONF_H_INCLUDED #endif +/* the problem is the old makefile */ + #include "mpi.h" #include "mpio.h" #ifdef HAVE_FCNTL_H @@ -235,6 +237,7 @@ typedef struct ADIOI_FileD { MPI_Win io_buf_put_amounts_window; /* Window over the io_buf_put_amounts */ /* External32 */ int is_external32; /* bool: 0 means native view */ + char * datarep; /* active data representation for this file */ } ADIOI_FileD; @@ -286,6 +289,7
@@ typedef struct { #define ADIO_PANFS 161 /* Panasas FS */ #define ADIO_LUSTRE 163 /* Lustre */ #define ADIO_GPFS 168 +#define ADIO_LOGFS 169 /* fake filesystem; layers on other */ #define ADIO_SEEK_SET SEEK_SET #define ADIO_SEEK_CUR SEEK_CUR @@ -294,6 +298,7 @@ typedef struct { #define ADIO_FCNTL_SET_ATOMICITY 180 #define ADIO_FCNTL_SET_DISKSPACE 188 #define ADIO_FCNTL_GET_FSIZE 200 +#define ADIO_FCNTL_SET_SLAVE 202 /* file system feature tests */ #define ADIO_LOCKS 300 /* file system supports fcntl()-style locking */ @@ -402,6 +407,7 @@ void ADIO_Resize(ADIO_File fd, ADIO_Offset size, int *error_code); void ADIO_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); void ADIO_ResolveFileType(MPI_Comm comm, const char *filename, int *fstype, ADIOI_Fns **ops, int *error_code); +const char * ADIO_FileTypeToPrefix (int filetype); void ADIO_Get_shared_fp(ADIO_File fd, ADIO_Offset size, ADIO_Offset *shared_fp, int *error_code); void ADIO_Set_shared_fp(ADIO_File fd, ADIO_Offset offset, int *error_code); diff --git a/src/mpi/romio/adio/include/adioi.h b/src/mpi/romio/adio/include/adioi.h index 5e930ac16b1..14036e9e6af 100644 --- a/src/mpi/romio/adio/include/adioi.h +++ b/src/mpi/romio/adio/include/adioi.h @@ -920,6 +920,7 @@ int ADIOI_Set_lock64(FDTYPE fd_sys, int cmd, int type, ADIO_Offset offset, int w int ADIOI_Strncpy( char *outstr, const char *instr, size_t maxlen ); char *ADIOI_Strdup( const char * ); +char * ADIOI_Strlower (char *); /* the current MPI standard is not const-correct, and modern compilers warn * about the following sort of code: diff --git a/src/mpi/romio/adio/include/adioi_fs_proto.h b/src/mpi/romio/adio/include/adioi_fs_proto.h index d1d50bf8b54..96a9f85237c 100644 --- a/src/mpi/romio/adio/include/adioi_fs_proto.h +++ b/src/mpi/romio/adio/include/adioi_fs_proto.h @@ -49,4 +49,19 @@ extern struct ADIOI_Fns_struct ADIO_GPFS_operations; /* prototypes are in adio/ad_gpfs/ad_gpfs.h */ #endif +#ifdef ROMIO_ICACHE +extern struct ADIOI_Fns_struct 
ADIO_ICACHE_operations; +/* prototypes are in adio/ad_icache/ad_icache.h */ +#endif + +#ifdef ROMIO_TRACE +extern struct ADIOI_Fns_struct ADIO_TRACE_operations; +/* prototypes are in adio/ad_trace/ad_trace.h */ +#endif + +#ifdef ROMIO_LOGFS +extern struct ADIOI_Fns_struct ADIO_LOGFS_operations; +#endif + + #endif diff --git a/src/mpi/romio/adio/include/growvector.h b/src/mpi/romio/adio/include/growvector.h new file mode 100644 index 00000000000..9a7004e6e4e --- /dev/null +++ b/src/mpi/romio/adio/include/growvector.h @@ -0,0 +1,96 @@ +#ifndef LOGFS_GROWVECTOR_H +#define LOGFS_GROWVECTOR_H + +#include <assert.h> +#include <string.h> + +/* always allocate at least 128 bytes */ +#define GROWVECTOR_MINSIZE 128 + +struct growvector_instance; + +typedef struct growvector_instance * growvector_handle; +typedef const struct growvector_instance * growvector_consthandle; + +growvector_handle growvector_create (int elesize, int cap); +void growvector_free (growvector_handle * handle); + +void growvector_clear (growvector_handle handle); + +/* make sure there is room for at least elesize elements */ +/* returns capacity + * If reduce is true, reductions will be allowed*/ +int growvector_reserve (growvector_handle handle, int elesize, int reduce); + +/* make more room in vector */ +void growvector_grow (growvector_handle handle); + +/* return pointer to element */ +static inline void * growvector_get (growvector_handle handle, int ele); +static inline void * growvector_pushback_checked (growvector_handle handle, void * + ele); + +static inline int growvector_elesize (growvector_consthandle handle); + +/* growvector_pushback(vector, elementptr, elementsize): append one element; in debug builds also check that elementsize matches the vector's element size. Expands to complete statements (trailing ';' included). */ +#ifndef NDEBUG +#define growvector_pushback(a,b,c) \ + growvector_pushback_checked ((a), (b)); assert ((c)==growvector_elesize((a))); +#else +#define growvector_pushback(a,b,c) \ + growvector_pushback_checked ((a), (b)); +#endif + +static inline int growvector_size (growvector_consthandle handle); +static inline int growvector_capacity (growvector_consthandle handle); +/* -- implementation issues -- */
+ +struct growvector_instance +{ + char * data; + int capacity; + int size; + int elesize; +}; + +static inline int growvector_elesize (growvector_consthandle handle) +{ + return handle->elesize; +} + +/* Return pointer to start of memory, 0 if vector is empty */ +static inline void * growvector_get_null (growvector_handle handle) +{ + return (handle->size ? handle->data : 0); +} + +static inline void * growvector_get (growvector_handle handle, int ele) +{ + assert (ele < handle->size); + return handle->data + (handle->elesize * ele); +} + +static inline int growvector_size (growvector_consthandle handle) +{ + return handle->size; +} + +static inline int growvector_capacity (growvector_consthandle handle) +{ + return handle->capacity; +} + +static inline void * growvector_pushback_checked (growvector_handle handle, void * + data) +{ + void * mem; + if (handle->capacity == handle->size) + growvector_grow (handle); + assert (handle->size < handle->capacity); + + /* hope GCC/compiler is smart here if elesize <= 32 ... 
*/ + ++handle->size; + mem = growvector_get(handle, handle->size-1); + memcpy (mem, data, handle->elesize); + return mem; +} +#endif diff --git a/src/mpi/romio/adio/include/layered.h b/src/mpi/romio/adio/include/layered.h new file mode 100644 index 00000000000..d5a97e63442 --- /dev/null +++ b/src/mpi/romio/adio/include/layered.h @@ -0,0 +1,138 @@ +#ifndef ROMIO_LAYERED_H_INCLUDE +#define ROMIO_LAYERED_H_INCLUDE + +#include <assert.h> +#include "adio.h" +#include "adioi.h" + +#define ROMIO_LAYER_MAGIC 12396541 + +/** + * Support functions for creating layering adio drivers + * + * The layered drivers cannot change ADIO_File members + * and should always use the layer access functions + * (layer_data, switchin, switchout) + */ + +typedef struct +{ + int magic; /* error detection */ + void * master_data; /* data for the outer driver */ + void * slave_data; /* original data of the slave driver */ + ADIOI_Fns * master_ops; /* operations structure of the master */ + ADIOI_Fns * slave_ops; /* operations structure of the slave */ + ADIOI_Fns * orig_fns; /* original fns pointer */ +} ADIOI_Layer_data; + + +/* + * Check the magic to verify this is actually a layered FD + */ +static inline void ADIOI_Layer_validate (ADIO_File fd) +{ + assert (fd->fs_ptr && + ((ADIOI_Layer_data*)fd->fs_ptr)->magic == ROMIO_LAYER_MAGIC); +} + +/* return the data pointer of the outer driver + * Cannot be called when the slave driver is SwitchedIn */ +static inline void * ADIOI_Layer_get_data (ADIO_File fd) +{ + ADIOI_Layer_validate (fd); + return ((ADIOI_Layer_data*)fd->fs_ptr)->master_data; +} + +/* store the data pointer of the outer driver and return it */ +static inline void * ADIOI_Layer_set_data (ADIO_File fd, void * data) +{ + ADIOI_Layer_validate (fd); + ((ADIOI_Layer_data*)fd->fs_ptr)->master_data = data; + return data; +} + +/* + * Restore the ADIO_File structure so that slave functions can be called; + * Restores the fs_ptr pointer & the operations pointer + * Returns a void * pointer that should be passed to ADIOI_Layer_switch_out to restore the + * outer(master) driver
+ */ +static inline void * ADIOI_Layer_switch_in (ADIO_File fd) +{ + ADIOI_Layer_data * d; + + ADIOI_Layer_validate (fd); + d = (ADIOI_Layer_data*)fd->fs_ptr; + + fd->fs_ptr = d->slave_data; + fd->fns = d->slave_ops; + return d; +} + +/* + * Restore the master driver + */ +static inline void ADIOI_Layer_switch_out (ADIO_File fd, void * data) +{ + ADIOI_Layer_data * d = (ADIOI_Layer_data*) data; + + + // d = (ADIOI_Layer_data*)fd->fs_ptr; + + /* we assume the operations on the slave DO NOT change the address + * of the fs_ptr or the fns struct */ + assert (fd->fs_ptr == d->slave_data); + assert (fd->fns == d->slave_ops); + + fd->fs_ptr = d; + fd->fns = d->master_ops; + ADIOI_Layer_validate (fd); +} + +/* + * Prepare a valid ADIO_File for layering; + * + * There are two modes: already_open = 1 or already_open = 0 + * + * if already_open = 0, it is assumed that the master is in control + * and the slave hasn't been active/opened yet. + * + * In this case, the layering is done with the specified driver + * as a slave. A copy of the Fns structure passed to the function will + * be used. + * When the function returns, the master driver will be active + * Also, the slave will be opened and the + * SET_SLAVE fcntl will be called on the master + * + * If already_open is 1, fns is assumed to point to the MASTER + * and fd should be a valid filehandle already opened by the slave. + * In this case the slave open (and SET_SLAVE fcntl) will not be called.
+ * + * In both cases the master is active when the function returns + */ +void ADIOI_Layer_init (ADIO_File fd, ADIOI_Fns * fns, void * masterdata, + int * error_code, int already_open); + +/* + * Remove the layering from the ADIO_File structure + * When the function returns, the slave driver will be active + * Returns the void * data of the master driver + */ +void * ADIOI_Layer_done (ADIO_File fd); + + +/* returns TRUE if the trace layer processed the event + * (in this case the calling function should also return), + * FALSE if the event was not handled and the caller should handle the event + */ +int ADIOI_Layer_fcntl (ADIO_File fd, int flag, ADIO_Fcntl_t * fcntl_struct, + int * error_code); + +int ADIOI_Layer_SetInfo (ADIO_File fd, MPI_Info users_info, + int * error_code); + +/* + * Return true if the slave is set and the switch functions can be called + */ +int ADIOI_Layer_is_slave_set (ADIO_File fd); + +#endif diff --git a/src/mpi/romio/adio/include/memstack.h b/src/mpi/romio/adio/include/memstack.h new file mode 100644 index 00000000000..8806294a189 --- /dev/null +++ b/src/mpi/romio/adio/include/memstack.h @@ -0,0 +1,31 @@ +/* + * Auto-growing unordered temp storage structure + */ +#ifndef MEMSTACK_H +#define MEMSTACK_H + + +/* Allocation granularity of memstack, in kb */ +#define MEMSTACK_BLOCKSIZE 1 + +struct memstack_instance; + +typedef struct memstack_instance * memstack_handle; +typedef const struct memstack_instance * memstack_consthandle; + +memstack_handle memstack_create (int elesize); + +void memstack_free (memstack_handle * handle); + +char * memstack_push (memstack_handle handle); +char * memstack_pop (memstack_handle handle); + +int memstack_getsize (memstack_consthandle handle); + +/* remove all elements */ +void memstack_clear (memstack_handle handle); + +/* reduce mem if possible */ +void memstack_reducemem (memstack_handle handle); + +#endif diff --git a/src/mpi/romio/adio/include/rtree.h b/src/mpi/romio/adio/include/rtree.h new file mode 
100644 index 00000000000..046228588cd --- /dev/null +++ b/src/mpi/romio/adio/include/rtree.h @@ -0,0 +1,178 @@ +#ifndef ROMIO_RTREE_H +#define ROMIO_RTREE_H + +#include "rtree_config.h" + +/* rtree_config.h must define both the range type and the data type */ +#ifndef RTREE_RANGE_TYPE +#error No RTREE_RANGE_TYPE set! +#endif + +#ifndef RTREE_DATA_TYPE +#error No RTREE_DATA_TYPE set! +#endif + +/* + * options + * RTREE_SORT_NODES + * RTREE_SORT_ENTRIES + */ +/* #define RTREE_INIT_MEM */ + + +#define RTREE_CLEAR_MEM + + +/* needs operator < for sorting */ +#define RTREE_SORT_NODES +#define RTREE_SORT_ENTRIES + +struct rtree_node; + +struct rtree; + +typedef struct +{ + RTREE_RANGE_TYPE start; + RTREE_RANGE_TYPE stop; +} rtree_range; + +typedef struct rtree * rtree_handle; +typedef const struct rtree * rtree_const_handle; + +typedef int (*rtree_callback) (const rtree_range * range, RTREE_DATA_TYPE * data, + void * extra); + +typedef void (*rtree_callback_split) (const rtree_range ** sources, + int * mapping, int count, void * extra); + +typedef void (*rtree_callback_copy) (const rtree_range * range, + RTREE_DATA_TYPE * data, RTREE_DATA_TYPE * newdata); + +typedef struct +{ + const rtree_range * range; + RTREE_DATA_TYPE * data; + int depth; + int treedepth; + void * nodeid; + void * parentid; + void * extra; + rtree_const_handle tree; +} rtree_callback_all_info; + +typedef int (*rtree_callback_all) (const rtree_callback_all_info * info); + +/* manipulation */ +rtree_handle rtree_create (); + +void rtree_free (rtree_handle * rtree); + +void rtree_add (rtree_handle tree, const rtree_range * range, + RTREE_DATA_TYPE data); + +/* + * Call function for every rectangle overlapping range + * Continue walking the tree as long as the callback return true + * Return false if the callback returned false, else return true + */ +int rtree_overlap (rtree_const_handle, const rtree_range * range, + rtree_callback callback, void * extra); + +/* return the span range of the tree */ +void rtree_get_range (rtree_const_handle tree, rtree_range * range); + +/*
return the depth of the tree */ +int rtree_get_depth (rtree_const_handle tree); + +/* Return maximum number of children in a node */ +int rtree_get_child_max (rtree_const_handle tree); + +/* return the minimum number of children in a node */ +int rtree_get_child_min (rtree_const_handle tree); + +/* return the number of elements in the tree */ +int rtree_get_count (rtree_const_handle tree); + +/* + * Call the callback for every node in the tree. + * Stop when the callback returns false. + * Returns true if all nodes were visited, + * false if stopped early because the callback returned false + * Passes extra to the callback + */ +int rtree_walk (rtree_const_handle tree, rtree_callback callback, void * extra); + + +int rtree_walk_all (rtree_const_handle tree, rtree_callback_all callback, void * + data); + +/* for debugging: write internal tree structure to the screen */ +void rtree_dump (rtree_const_handle tree); + +/* for debugging; validate consistenty of node extents */ +int rtree_check (rtree_const_handle tree); + +/* search for specific range; returns pointer to data if found, 0 otherwise */ +RTREE_DATA_TYPE * rtree_find (rtree_const_handle tree, + const rtree_range * range); + + +/* set callback to be called on entry removal */ +void rtree_set_freefunc (rtree_handle tree, rtree_callback func, void * + extra); + +/* set callback to decide how nodes are split */ +void rtree_set_splitfunc (rtree_handle tree, rtree_callback_split func, void * + data); + +/* try to remove the specified range from the tree; + * The exact range must be present; If found, remove and + * return TRUE; In addition, if data is not 0, data will be + * set to the data value of the range. 
Returns FALSE + * if the range cannot be found + */ +int rtree_remove (rtree_handle tree, const rtree_range * range, + RTREE_DATA_TYPE * data); + +/**** TODO + + -> range delete + -> range contain : all ranges completely inside the given range + + ***/ + + +/* + * Copy the whole tree; Optionally calling rtree_callback_copy + * to copy the object data; If copy is NULL, data values will + * be copied to the new tree + */ +rtree_handle rtree_copy (rtree_const_handle tree, rtree_callback_copy copy, + void * extra); + + +void rtree_clear (rtree_handle tree); + +/* return true if the tree is empty */ +int rtree_empty (rtree_const_handle tree); + +/******************** iterator functions **************************/ +/* TODO make inline */ + + +struct rtree_iterator; + +typedef struct rtree_iterator * rtree_iterator_handle; +typedef const struct rtree_iterator * rtree_const_iterator_handle; + +const RTREE_DATA_TYPE * rtree_iterator_get_item_data (rtree_const_iterator_handle iter); + +const rtree_range * rtree_iterator_get_item_range (rtree_const_iterator_handle iter); + +void rtree_iterator_forward (rtree_iterator_handle); + +void rtree_iterator_backward (rtree_iterator_handle); + +rtree_iterator_handle rtree_iterator_create (rtree_handle tree); + +#endif diff --git a/src/mpi/romio/adio/include/typehelper.h b/src/mpi/romio/adio/include/typehelper.h new file mode 100644 index 00000000000..92fe71cb65d --- /dev/null +++ b/src/mpi/romio/adio/include/typehelper.h @@ -0,0 +1,80 @@ +#ifndef ROMIO_TYPEHELPER_H +#define ROMIO_TYPEHELPER_H + + +#include "adio.h" + +/* + * Helper functions for dealing with datatypes + * + * + */ + +/* all can be 0 except processdata */ +typedef struct +{ + /* start processing */ + void (*start) (void * data); + + /* start of contiguous region in the second datatype + * (will usually be the filetype) */ + void (*startfragment) (ADIO_Offset fileoffset, ADIO_Offset fragmentsize, + void * data); + + /* return zero to stop processing */ + int (*processdata)
(void * membuf, int size, ADIO_Offset fileoffset, void * + data); + + /* end of contiguous region in second filetype + * status = 0 if processing was cancelled */ + void (*stopfragment) (int status, void * data); + + /* enf of processing for these types + * status = 0 if processing was cancelled */ + void (*stop) (int status, void * data); +} DatatypeHandler; + +/* call callback to process the data stream; Displacement is in BYTES in the + * file (not etype extents) */ +int typehelper_processtypes (MPI_Datatype memtype, void * buf, int count, + MPI_Datatype filetype, MPI_Datatype etype, int offset, + int displacement, const DatatypeHandler * callback, void * data); + +/* debug helper: just dump parts to the screen */ +void typehelper_processtypes_debug (MPI_Datatype memtype, void * buf, + int count, MPI_Datatype filetype,MPI_Datatype etype, int offset, + int displacement); + +/* calculate file offset, and call processtypes */ +void typehelper_processoperation (MPI_Datatype memtype, void * buf, int count, + ADIO_File fd, MPI_Offset offset, int file_ptr_type, + const DatatypeHandler * callback, void * data, int debug); + +void typehelper_processtypes_debug (MPI_Datatype memtype, + void * buf, int count, MPI_Datatype filetype, + MPI_Datatype etype, int offset,int displacement); +/* stream the datatype contents (meant for memory datatypes) + * In the callback functions, ofs will refer to the linear offset within the + * datatype at which this fragment starts (so ofs runs from 0 .. 
+ * (MPI_Type_size - 1)*/ +int typehelper_decodememtype (const void * buf, int count, MPI_Datatype memtype, + const DatatypeHandler * callback, void * data); + + +/* + * Calculate the first and last bytes in the file affected by + * a write with these parameters + */ +void typehelper_calcrange (MPI_Datatype etype, MPI_Datatype ftype, + ADIO_Offset disp, ADIO_Offset ofs, int writesize, ADIO_Offset * start, + ADIO_Offset * stop); + +/* + * Iterate over continuous regions in the file access pattern + * offset given in etypes + */ +void typehelper_calcaccess (MPI_Datatype etype, + MPI_Datatype ftype, ADIO_Offset disp, ADIO_Offset offset, + int writesize, const DatatypeHandler * cb, void * userdata); + +#endif diff --git a/src/mpi/romio/adio/include/writering.h b/src/mpi/romio/adio/include/writering.h new file mode 100644 index 00000000000..fab54fb9b5a --- /dev/null +++ b/src/mpi/romio/adio/include/writering.h @@ -0,0 +1,132 @@ +/**************************************************************************** + * Write optimization for LogFS + * (Could also be used when for SEQUENTIAL opened files in MPI-IO) + * + * keep a number of large memory buffers; accept all writes and + * append them to the active buffer; If a buffer is full, start a + * nonblocking write for it. 
+ * + * When running out of buffers, wait until the nonblocking until a buffer + * gets written to disc so that it can be reused + * + * IDEA: to avoid copying, writes larger than a block could + * 1) cause a seek and be written in a nonblocking way + * 2) cause a flush (to avoid the seek) and be written nonblocking + * + * For now, they are added to the cache as is everything else + * + * There will only be ONE nonblocking write at the same time + * + */ + +#ifndef ROMIO_LOGFS_WRITERING_H +#define ROMIO_LOGFS_WRITERING_H + + +#include "writering_types.h" + + +/* To make it easy to test inside and outside ROMIO + * Inside ROMIO, WRR_OFFSET is defined to be ADIO_Offset + */ +#ifndef WRR_OFFSET +#error WRR offset type is not defined! +#endif + +struct writering_instance; + +typedef struct writering_instance * writering_handle; +typedef const struct writering_instance * writering_consthandle; + + +typedef struct +{ + /* will be called when the writering is created + * read / write indicate which operations are needed + */ + int (*init) (void * opsdata, int read, int write); + + /* will be called when the writering is destroyed */ + int (*done) (void * opsdata); + + /* initiate a write operation */ + int (*start_write) (void * opsdata, WRR_OFFSET ofs, const void * data, unsigned int size); + + /* return true if the write is finished */ + int (*test_write) (void * opsdata, unsigned int *written); + + /* wait until the write is finished */ + int (*wait_write) (void * opsdata, unsigned int *written); + + /* flush if supported; Will not be called when a + * write/read is in progress */ + int (*flush) (void * opsdata); + + /* truncate file to given size */ + int (*reset) (void * opsdata, WRR_OFFSET ofs); + + /* return filesize; will only be called when file is open */ + int (*getsize) (void * opsdata, WRR_OFFSET * ofs); + + /* start read at specified byte ofs */ + int (*start_read) (void * opsdata, WRR_OFFSET ofs, void * data, unsigned int size); + + /* test for read finish;
set bytes read */ + int (*test_read) (void * opsdata, unsigned int * read); + + /* wait for read finish; set bytes read */ + int (*wait_read) (void * opsdata, unsigned int * read); + +} writering_ops; + + +/* readblockcount == 0: no reading allowed; >0: reads allowed + * writeblockcount == 0: no writing allowed; >0: writes allowed + */ +writering_handle writering_create (int blocksize, int maxblockcount, + const writering_ops * operations, void * data, + int read, int write); + +void writering_free (writering_handle * handle); + + +/* try to free mem by releasing all non-dirty blocks + * (this is all readahead data + all flushed write data)*/ +void writering_reducemem (writering_handle handle); + +/* progress writing if needed */ +void writering_progress (writering_handle handle); + +/* if sync is true, disable all read-ahead/write-behind */ +void writering_setsync (writering_handle handle, int sync); + +/* read from file; return bytes read, -1 if error, 0 indicates EOF*/ +int writering_read (writering_handle handle, WRR_OFFSET ofs, void * buf, + unsigned int size); + +void writering_write (writering_handle handle, WRR_OFFSET ofs, + const void * data, unsigned int size); + +/* FLush data in write cache */ +void writering_write_flush (writering_handle handle); + +/* Flush data in read cache */ +void writering_read_flush (writering_handle handle); + +/* do read & write flush */ +void writering_flush (writering_handle handle); + + +/* truncate file to given position; resets read and write pointer */ +void writering_truncate (writering_handle handle, WRR_OFFSET ofs); + +/* return filesize */ +void writering_getsize (writering_handle handle, WRR_OFFSET * ofs); + + +/* set debug mode */ +void writering_setdebug (writering_handle handle, int debug); + +void writering_reset (writering_handle handle, WRR_OFFSET size); + +#endif diff --git a/src/mpi/romio/adio/include/writering_types.h b/src/mpi/romio/adio/include/writering_types.h new file mode 100644 index 
00000000000..e9727ae63c7 --- /dev/null +++ b/src/mpi/romio/adio/include/writering_types.h @@ -0,0 +1,10 @@ +#ifndef WRITERING_TYPES_H +#define WRITERING_TYPES_H + +#include "adio.h" + +#define WRR_OFFSET ADIO_Offset + + +#endif + diff --git a/src/mpi/romio/configure.ac b/src/mpi/romio/configure.ac index dd78f68599d..69aeaff0a5a 100644 --- a/src/mpi/romio/configure.ac +++ b/src/mpi/romio/configure.ac @@ -185,7 +185,7 @@ dnl An m4 macro for use with m4_foreach_w and friends. You should modify this dnl list if you want to add a known file system. The list is just whitespace dnl separated, so you can use newlines and tabs as well. m4_define([known_filesystems_m4_w], - [nfs ufs pvfs2 testfs xfs panfs lustre gpfs])dnl + [nfs ufs pvfs2 testfs xfs panfs lustre gpfs logfs])dnl dnl dnl An m4 macro for use with m4_foreach and friends. Expands to a quoted list of dnl quoted elements. A bit easier to use without unintended expansion than the @@ -745,8 +745,9 @@ if test -z "$FILE_SYSTEM" ; then FILE_SYSTEM="ufs nfs" fi -# no matter what, always build testfs -FILE_SYSTEM="testfs $FILE_SYSTEM" +# no matter what, always build testfs and logfs +file_system_logfs=1 +FILE_SYSTEM="testfs logfs $FILE_SYSTEM" # check for valid file system if test -n "$FILE_SYSTEM" ; then @@ -889,6 +890,11 @@ if test -n "$file_system_lustre"; then ) fi +if test -n "$file_system_logfs"; then + AC_DEFINE(ROMIO_LOGFS,1,[Define for ROMIO with LOGFS]) +fi + + if test -n "$file_system_xfs"; then AC_DEFINE(ROMIO_XFS,1,[Define for ROMIO with XFS]) # Check for memalign value diff --git a/src/mpi/romio/mpi-io/close.c b/src/mpi/romio/mpi-io/close.c index 3f2b0b1dd61..2732cb2f8c0 100644 --- a/src/mpi/romio/mpi-io/close.c +++ b/src/mpi/romio/mpi-io/close.c @@ -70,6 +70,8 @@ int MPI_File_close(MPI_File *fh) /* --END ERROR HANDLING-- */ } } + ADIOI_Free (adio_fh->datarep); + /* Because ROMIO expects the MPI library to provide error handler management * routines but it doesn't ever participate in MPI_File_close, we have 
to diff --git a/src/mpi/romio/mpi-io/delete.c b/src/mpi/romio/mpi-io/delete.c index 3272edc81af..003fadcb162 100644 --- a/src/mpi/romio/mpi-io/delete.c +++ b/src/mpi/romio/mpi-io/delete.c @@ -55,7 +55,7 @@ int MPI_File_delete(ROMIO_CONST char *filename, MPI_Info info) /* resolve file system type from file name; this is a collective call */ ADIO_ResolveFileType(MPI_COMM_SELF, filename, &file_system, &fsops, - &error_code); + &error_code); /* --BEGIN ERROR HANDLING-- */ if (error_code != MPI_SUCCESS) @@ -78,6 +78,9 @@ int MPI_File_delete(ROMIO_CONST char *filename, MPI_Info info) if (tmp > filename + 1) filename = tmp + 1; + /* TODO: if doing layering -> interfere here + * (see MPI_Open for example) */ + /* call the fs-specific delete function */ (fsops->ADIOI_xxx_Delete)(filename, &error_code); /* --BEGIN ERROR HANDLING-- */ diff --git a/src/mpi/romio/mpi-io/get_view.c b/src/mpi/romio/mpi-io/get_view.c index 946905bb917..47d59c7c605 100644 --- a/src/mpi/romio/mpi-io/get_view.c +++ b/src/mpi/romio/mpi-io/get_view.c @@ -67,8 +67,8 @@ int MPI_File_get_view(MPI_File fh, MPI_Offset *disp, MPI_Datatype *etype, /* --END ERROR HANDLING-- */ *disp = adio_fh->disp; - ADIOI_Strncpy(datarep, - (adio_fh->is_external32 ? 
"external32": "native"), MPI_MAX_DATAREP_STRING); + ADIOI_Strncpy(datarep, + fh->datarep, MPI_MAX_DATAREP_STRING); MPI_Type_get_envelope(adio_fh->etype, &i, &j, &k, &combiner); if (combiner == MPI_COMBINER_NAMED) *etype = adio_fh->etype; diff --git a/src/mpi/romio/mpi-io/open.c b/src/mpi/romio/mpi-io/open.c index 7cebb5abc79..14a4af86d89 100644 --- a/src/mpi/romio/mpi-io/open.c +++ b/src/mpi/romio/mpi-io/open.c @@ -28,9 +28,55 @@ int MPI_File_open(MPI_Comm comm, const char *filename, int amode, MPI_Info info, /* for user-definde reduce operator */ #include "adio_extern.h" +#if ROMIO_ICACHE || ROMIO_TRACE +#include "layered.h" +#endif + +#ifdef ROMIO_ICACHE +#include "adioi_fs_proto.h" +#define ROMIO_KEY_ICACHE_ENABLE "romio_icache" +#endif + +#ifdef ROMIO_TRACE +#define ROMIO_KEY_TRACE_ENABLE "romio_trace" +#define ROMIO_KEY_TRACE_TWICE_ENABLE "romio_trace_twice" +#endif + +#include + extern int ADIO_Init_keyval; +#if ROMIO_TRACE || ROMIO_ICACHE +/* check if the specified key is set; + * If so, change fns to point to the new fsops, and save the old one in the + * pointer pointed to by 'old' + */ +int check_layered (MPI_Info info, const char * s, ADIOI_Fns * myfns, + ADIOI_Fns ** fns, ADIOI_Fns ** old) +{ + int enable = 0; + + if (MPI_INFO_NULL != info) + { + int flag; + char dummy; + + MPI_Info_get (info, (char *)s, 1, &dummy, &flag); + if (flag) + enable=1; + } + if (enable) + { + *old = *fns; + *fns = myfns; + } + + return enable; +} +#endif + + /*@ MPI_File_open - Opens a file @@ -53,6 +99,18 @@ int MPI_File_open(MPI_Comm comm, ROMIO_CONST char *filename, int amode, MPI_Comm dupcomm = MPI_COMM_NULL; ADIOI_Fns *fsops; static char myname[] = "MPI_FILE_OPEN"; + +#ifdef ROMIO_ICACHE + int ad_icache_enable = 0; + ADIOI_Fns * ad_icache_old; +#endif +#ifdef ROMIO_TRACE + int ad_trace_enable = 0; + int ad_trace_twice_enable = 0; + ADIOI_Fns * ad_trace_old; + ADIOI_Fns * ad_trace_twice_old; +#endif + #ifdef MPI_hpux int fl_xmpi; @@ -127,7 +185,8 @@ int 
MPI_File_open(MPI_Comm comm, ROMIO_CONST char *filename, int amode, file_system = -1; /* resolve file system type from file name; this is a collective call */ - ADIO_ResolveFileType(dupcomm, filename, &file_system, &fsops, &error_code); + ADIO_ResolveFileType(dupcomm, filename, &file_system, &fsops, + &error_code); /* --BEGIN ERROR HANDLING-- */ if (error_code != MPI_SUCCESS) { @@ -153,6 +212,33 @@ int MPI_File_open(MPI_Comm comm, ROMIO_CONST char *filename, int amode, *fh = ADIO_Open(comm, dupcomm, filename, file_system, fsops, amode, 0, MPI_BYTE, MPI_BYTE, info, ADIO_PERM_NULL, &error_code); + if (error_code == MPI_SUCCESS) + { + + /* FIXME: wrong here: the order is wrong when there are multiple slaves. + * Is a stack needed? + * (Translated from the original Dutch comment, replacing its + * machine translation.) */ + +#ifdef ROMIO_TRACE + ad_trace_enable = check_layered (info, ROMIO_KEY_TRACE_ENABLE, + &ADIO_TRACE_operations, &fsops, &ad_trace_old); +#endif + +#ifdef ROMIO_ICACHE + ad_icache_enable = check_layered (info, ROMIO_KEY_ICACHE_ENABLE, + &ADIO_ICACHE_operations, &fsops, &ad_icache_old); +#endif + +#ifdef ROMIO_TRACE + ad_trace_twice_enable = check_layered (info, ROMIO_KEY_TRACE_TWICE_ENABLE, + &ADIO_TRACE_operations, &fsops, &ad_trace_twice_old); +#endif + + } + + + /* --BEGIN ERROR HANDLING-- */ if (error_code != MPI_SUCCESS) { diff --git a/src/mpi/romio/mpi-io/set_view.c b/src/mpi/romio/mpi-io/set_view.c index 4d98fdd1844..563310d06b6 100644 --- a/src/mpi/romio/mpi-io/set_view.c +++ b/src/mpi/romio/mpi-io/set_view.c @@ -26,6 +26,10 @@ int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, MPI_Data #include "mpioprof.h" #endif +#ifdef ROMIO_LOGFS +#include "../adio/ad_logfs/logfs.h" +#endif + /*@ MPI_File_set_view - Sets the file view @@ -48,6 +52,10 @@ int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, ADIO_Offset shared_fp, byte_off; ADIO_File adio_fh; +#ifdef ROMIO_LOGFS + int logfs_enabled = 0; +#endif + ROMIO_THREAD_CS_ENTER(); adio_fh =
MPIO_File_resolve(fh); @@ -126,6 +134,20 @@ int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, goto fn_exit; } +#ifdef ROMIO_LOGFS + if (!strcmp(datarep, "logfs") || !strcmp(datarep, "LOGFS")) + { + logfs_enabled = 1; + datarep = "logfs"; + /* TODO: enable logfs */ + /* use info argument for + * - replay on close + * - logfile location + * - logfile limit + */ + } +#endif + if ((datarep == NULL) || (strcmp(datarep, "native") && strcmp(datarep, "NATIVE") && strcmp(datarep, "external32") && @@ -133,15 +155,64 @@ int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype, strcmp(datarep, "internal") && strcmp(datarep, "INTERNAL")) ) { +#ifdef ROMIO_LOGFS + /* logfs is also supported as a data rep + * We could also just lie about the datarep and say it's native in this + * case, but then we cannot check later on to see if it was enabled */ + if (!logfs_enabled) + { +#endif error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, __LINE__, MPI_ERR_UNSUPPORTED_DATAREP, "**unsupporteddatarep",0); error_code = MPIO_Err_return_file(adio_fh, error_code); goto fn_exit; +#ifdef ROMIO_LOGFS } +#endif + } + /* --END ERROR HANDLING-- */ + else + datarep = "native"; + + /* length test of datarep string + * not needed since the previous test would fail for anything + * larger than "logfs" or "native" */ + + /* --END ERROR HANDLING-- */ + /* datarep will be "logfs" or "native" */ + +#ifdef ROMIO_LOGFS + /* check for switching to/from logfs */ + if (!strcmp (fh->datarep,"native") && logfs_enabled) + { + /* switch from native to logfs */ + logfs_activate (fh, info); + + /* pass view info */ + logfs_set_view (fh, disp, etype, filetype); + } + else if (!strcmp (fh->datarep,"logfs") && !logfs_enabled) + { + /* switch from logfs to native */ + /* deactivate layering and force replay */ + logfs_deactivate(fh); + } + /* lastly, deal with a file view in the prefix case, where logfs was requested + * via 'logfs:' prefix */ + if (fh->file_system
== ADIO_LOGFS) { + logfs_set_view(fh, disp, etype, filetype); + } +#endif + + /* update view */ + if (fh->datarep) + ADIOI_Free (fh->datarep); /* mem from strdup */ + fh->datarep = ADIOI_Strdup (datarep); + if (disp == MPI_DISPLACEMENT_CURRENT) { MPI_Barrier(adio_fh->comm); ADIO_Get_shared_fp(adio_fh, 0, &shared_fp, &error_code);