% Conference paper entry.
% - Author names use the unambiguous "Last, First" form throughout, with
%   unbraced given names so that styles can abbreviate every initial.
% - A "series" field that merely repeated "booktitle" verbatim was dropped.
% NOTE(review): the DOI prefix and the "note" field both point to the Workshops
%   volume (IPDPSW 2016), whereas "booktitle" names the main IPDPS proceedings;
%   confirm against the publisher record before relying on the venue string.
% NOTE(review): "address" holds a country rather than the publisher's city;
%   confirm and replace if the city is known.
@inproceedings{c4861b2c7d8b4ee7886f5d58b6791a4b,
  author    = {Strazdins, Peter E. and Ali, Md Mohsin and Debusschere, Bert},
  title     = {Application Fault Tolerance for Shrinking Resources via the Sparse Grid Combination Technique},
  booktitle = {Proceedings - 2016 IEEE 30th International Parallel and Distributed Processing Symposium, IPDPS 2016},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {1232--1238},
  year      = 2016,
  month     = jul,
  day       = {18},
  doi       = {10.1109/IPDPSW.2016.210},
  language  = {English},
  keywords  = {Algorithm-based fault tolerance, Cloud computing, Elasticity, PDE solvers, Parallel computing, Process failure recovery, Sparse grid combination technique, ULFM},
  abstract  = {The need to make large-scale scientific simulations resilient to the shrinking and growing of compute resources arises from exascale computing and adverse operating conditions (fault tolerance). It can also arise from the cloud computing context where the cost of these resources can fluctuate. In this paper, we describe how the Sparse Grid Combination Technique can make such applications resilient to shrinking compute resources. The solution of the non-trivial issues of dealing with data redistribution and on-the-fly malleability of process grid information and ULFM MPI communicators are described. Results on a 2D advection solver indicate that process recovery time is significantly reduced from the alternate strategy where failed resources are replaced, overall execution time is actually improved from this case and for checkpointing and the execution error remains small, even when multiple failures occur.},
  note      = {Publisher Copyright: {\textcopyright} 2016 IEEE.; 30th IEEE International Parallel and Distributed Processing Symposium Workshops, IPDPSW 2016 ; Conference date: 23-05-2016 Through 27-05-2016},
}