; Hand this in to: ece849-staff+hw@ece.cmu.edu ; Required Reading @InCollection{randell95_evolution_recovery_blocks, author = "Brian Randell", title = "The evolution of the recovery block concept", booktitle = "Software Fault Tolerance", editor = "Lyu", organization = "University of Virginia, VA, USA", year = "1995", chapter = "1", pages = "1--21", abstract = "This chapter reviews the development of the recovery block approach to software fault tolerance and subsequent work based on this approach. It starts with an account of the development and implementations of the basic recovery block scheme in teh early 1970s at Newcastle, and then goes on to describe work at Newcastle and elsewhere on extensions to the basic scheme, recovery in concurrent systems, and linquistic support for recovery blocks based on the use of object-oriented programming concepts", url = "http://www.ece.cmu.edu/~ece849/papers/randell95_evolution_recovery_blocks.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @inproceedings{ xurollforward, author = "J. Xu and B. Randell", title = "Roll-Forward Error Recovery in Embedded Real-Time Systems", pages = "414--421", abstract = "Roll-forward checkpointing schemes [8][10] are developed in order to avoid rollback in the presence of independent faults and to increase the possibility that a task completes within a tight deadline. However, despite of the adoption of roll-forward recovery, these schemes are not necessarily appropriate for time-critical applications because interactions with the external environment and communications between processes must be deferred during checkpoint validation steps...", url = "http://citeseer.ist.psu.edu/42201.html", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @Conference{Li90, author = "Chung-Chi Jim Li; W. Kent Fuchs", title = "CATCH -- Compiler-Assisted Techniques for Checkpointing", inbook = "Proceedings of 20th Fault Tolerant Computing Symposium ", year = "1990", pages = "74--81", abstract = "This paper describes a compiler-based approach to generating efficient checkpoints for process recovery. Our approach to check-pointing is programmer, operating system, and hardware transparent. Compile-time information is exploited to maintain the desired checkpoint interval and to reduce the size of checkpoints. Compiler-generated sparse potential checkpoint code is used to maintain the desired checkpoint interval. Adaptive checkpointing has been developed to reduce the size of checkpoints by exploiting potentially large variations in memory usage. A training technique is used in selecting low-cost, high-coverage potential checkpoints. Since this potential checkpoint selection problem is NP-complete, a heuristic algorithm has been developed to obtain a quick suboptimal solution. These compiler-assisted checkpointing techniques have been implemented in a modified version of the GNUC (GCC) compiler version 1.34. Experiments utilizing the CATCHGCC compiler on SUN workstations are described.", url = "http://ieeexplore.ieee.org//iel2/248/1532/00037973.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } ; Supplemental Reading @article{ chiu02_forced_checkpoints, author = "Chiu, J.-F.; Ge-Ming Chiu", title = "Placing forced checkpoints in distributed real-time embedded systems", journal = "Computing & Control Engineering Journal ", volume = "13", number = "4", pages = "197--205", year = "2002", abstract = "An efficient scheme for placing forced checkpoints in a distributed real-time embedded system so as to eliminate useless checkpoints is presented. The notion of primary non-causal intervals is introduced; these intervals are shown to be the only candidates that need to be considered for inserting a minimum number of forced checkpoints. An efficient algorithm is the used to identify the primary non-causal intervals where forced checkpoints should be inserted. The algorithm first converts the original problem to another problem on a directed graph, which may reflect the existence of useless checkpoints. The new problem can be efficiently solved using existing methods.", url = "http://www.ece.cmu.edu/~ece749/papers/chiu02_forced_checkpoints.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "medium-low rankings in 2005", } @article{Elnozahy02, author = "Elnozahy, E.N. ; Alvisi, L. ; Yi-Min Wang ; Johnson, D.B.", title = "A survey of rollback-recovery protocols in message-passing systems", journal = "ACM Computing Surveys 34,", year = "2002", pages = "375-408", number = "3", abstract = "This survey covers rollback-recovery techniques that do not require special language constructs. In the first part of the survey we classify rollback-recovery protocols into checkpoint-based and log-based. Checkpoint-based protocols rely solely on checkpointing for system state restoration. Checkpointing can be coordinated, uncoordinated, or communication-induced. Log-based protocols combine checkpointing with logging of nondeterministic events, encoded in tuples called determinants. Depending on how determinants are logged, log-based protocols can be pessimistic, optimistic, or causal. Throughout the survey, we highlight the research issues that are at the core of rollback-recovery and present the solutions that currently address them. We also compare the performance of different rollback-recovery protocols with respect to a series of desirable properties and discuss the issues that arise in the practical implementations of these protocols", url = "http://citeseer.nj.nec.com/elnozahy96survey.html", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Pradhan94, author = "Pradhan, D.K. ; Vaidya, N.H.", title = "Roll-forward checkpointing scheme: a novel fault-tolerant architecture", journal = "IEEE Transactions on Computers 43,", year = "1994", pages = "1163-74", number = "10", abstract = "We propose a novel architecture for a fault-tolerant multiprocessor environment. It is assumed that the multiprocessor organization consists of a pool of active processing modules and either a small number of spare modules or active modules with some spare processing capacity. A fault-tolerance scheme is developed for duplex systems using checkpoints. Our scheme, unlike traditional checkpointing schemes, requires no rollbacks for recovering from single faults. The objective is to achieve performance of a triple modular redundant system using duplex system redundancy", url = "http://ieeexplore.ieee.org/iel1/12/7726/00324542.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @Conference{Pradhan94, author = "Pradhan, D.K. ; Vaidya, N.H. ", title = "Roll-forward and rollback recovery: performance-reliability trade-off", inbook = "Digest of Papers. The Twenty-Fourth International Symposium on Fault-Tolerant Computing ", year = "1994", pages = "186-95", abstract = "Performance and reliability achieved by a modular redundant system depend on the recovery scheme used. Typically, gain in performance using comparable resources results in reduced reliability. Several high performance computers are noted for small mean time to failure. Performance is measured here in terms of mean and variance of the task completion time, reliability being a task-based measure defined as the probability that a task is completed correctly. Two roll-forward schemes are compared with two rollback schemes for achieving recovery in duplex systems. The roll-forward schemes discussed here are based on a roll-forward checkpointing concept. Roll-forward recovery schemes achieve significantly better performance than rollback schemes by avoiding rollback in most common fault scenarios. It is shown that the roll-forward schemes improve performance with only a small loss in reliability as compared to rollback schemes", url = "http://citeseer.nj.nec.com/98758.html", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Krishna93, author = "Krishna, C.M. ; Singh, A.D.", title = "Reliability of checkpointed real-time systems using time redundancy", journal = "IEEE Transactions on Reliability 42,", year = "1993", pages = "427-35", number = "3", abstract = "Real-time computers are often used in embedded, life-critical applications where high reliability is important. A common approach to making such systems dependable is to vote on redundant processors executing multiple copies of the same task is described. The processors which make up such voted systems are subjected not only to independently occurring permanent and transient failure, but also to correlated transients brought about by electromagnetic interference from the operating environment. To counteract these transients, checkpointing and time redundancy are required, in addition to processor redundancy. This work analyzes the use of time and device redundancy in systems subject to correlated failure. The tradeoffs in checkpoint placement in such a system are found to be considerably different from those for non-redundant systems without real-time constraints. The authors compare fault-tolerant designs and without a rollback capability, accounting for the increased hardware-failure rate due to processor duplication when faults are detected in hardware, and the doubled execution times when detection is implemented in software", url = "http://ieeexplore.ieee.org/iel1/24/6538/00257826.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @Conference{Leu89, author = "Leu, P.-J. ; Bhargava, B. ", title = "A model for concurrent checkpointing and recovery using transactions", inbook = "9th International Conference on Distributed Computing Systems ", year = "1989", pages = "423-30", abstract = "Concurrent checkpointing and recovery using a concurrent transaction processing model which consists of four types of atomic operation and five types of conflict is developed. Each checkpoint/rollback transaction is executed by multiple processes in the system. They can be executed concurrently. It is shown that the consistency of recovery lines and rollback lines established by checkpoint transactions and rollback transactions can be achieved by enforcing serializability on the corresponding transactions. There are two advantages in using a transaction model for concurrent checkpointing and recovery: (1) it is easier to find algorithms to solve a transaction processing problem; and (2) based on this model, related issues of the two corresponding problems can be thought of uniformly. This model clarifies the concepts of concurrent checkpointing and recovery, and brings more ideas for designing algorithms", url = "http://ieeexplore.ieee.org//iel2/248/1532/00037973.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Campbell86, author = "Campbell, R.H. ; Randell, B.", title = "Error recovery in asynchronous systems", journal = "IEEE Transactions on Software Engineering SE-12,", year = "1986", pages = "811-26", number = "8", abstract = "A framework for the provision of fault tolerance in asynchronous systems is introduced. The proposal generalizes the form of simple recovery facilities supported by nested atomic actions in which the exception mechanisms only permit backward error recovery. It allows the construction of systems using both forward and backward error recovery and thus allows the exploitation of the complementary benefits of the two schemes. Backward recovery, forward recovery, and normal processing activities can occur concurrently within the organization proposed. Exception handling is generalized to provide a uniform basis for fault tolerance schemes with the atomic action structure. The generalization includes a resolution scheme for concurrently raised exceptions based on an exception tree and an abortion scheme that permits the termination of the internal atomic actions. An automatic resolution mechanism is outlined for exceptions in atomic actions which allows users to separate their recovery schemes from the details of the underlying algorithms", url = "http://www.ece.cmu.edu/~ece749/papers/campbell86_error_recovery.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", }