; Hand this in to:   ece849-staff+hw@ece.cmu.edu

; Required Reading


@InCollection{randell95_evolution_recovery_blocks,
  author =	 "Brian Randell",
  title =	 "The evolution of the recovery block concept",
  booktitle =	 "Software Fault Tolerance",
  editor =	 "Lyu",
  organization = "University of Virginia, VA, USA",
  year =	 "1995",
  chapter =	 "1",
  pages =	 "1--21",
  abstract =	 "This chapter reviews the development of the recovery
                  block approach to software fault tolerance and
                  subsequent work based on this approach. It starts
                  with an account of the development and
                  implementations of the basic recovery block scheme
                  in teh early 1970s at Newcastle, and then goes on to
                  describe work at Newcastle and elsewhere on
                  extensions to the basic scheme, recovery in
                  concurrent systems, and linquistic support for
                  recovery blocks based on the use of object-oriented
                  programming concepts",
  url =                  "http://www.ece.cmu.edu/~ece849/papers/randell95_evolution_recovery_blocks.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}

@inproceedings{ xurollforward,
  author =	 "J. Xu and B. Randell",
  title =	 "Roll-Forward Error Recovery in Embedded Real-Time
                  Systems",
  pages =	 "414--421",
  abstract =	 "Roll-forward checkpointing schemes [8][10] are
                  developed in order to avoid rollback in the presence
                  of independent faults and to increase the
                  possibility that a task completes within a tight
                  deadline. However, despite of the adoption of
                  roll-forward recovery, these schemes are not
                  necessarily appropriate for time-critical
                  applications because interactions with the external
                  environment and communications between processes
                  must be deferred during checkpoint validation
                  steps...",
  url =		 "http://citeseer.ist.psu.edu/42201.html",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}

@Conference{Li90,
  author =	 "Chung-Chi Jim Li; W. Kent Fuchs",
  title =	 "CATCH -- Compiler-Assisted Techniques for Checkpointing",
  inbook =	 "Proceedings of 20th Fault Tolerant Computing Symposium ",
  year =	 "1990",
  pages =	 "74--81",
  abstract =	 "This paper describes a compiler-based approach to generating
  efficient checkpoints for process recovery. Our approach to check-pointing 
  is programmer, operating system, and hardware transparent. Compile-time 
  information is exploited to maintain the desired checkpoint interval and 
  to reduce the size of checkpoints. Compiler-generated sparse potential
  checkpoint code is used to maintain the desired checkpoint interval.
  Adaptive checkpointing has been developed to reduce the size of
  checkpoints by exploiting potentially large variations in
  memory usage. A training technique is used in selecting low-cost,
  high-coverage potential checkpoints. Since this potential
  checkpoint selection problem is NP-complete, a heuristic
  algorithm has been developed to obtain a quick suboptimal
  solution. These compiler-assisted checkpointing techniques
  have been implemented in a modified version of the GNUC (GCC)
  compiler version 1.34. Experiments utilizing the CATCHGCC 
  compiler on SUN workstations are described.",
  url = "http://ieeexplore.ieee.org//iel2/248/1532/00037973.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}
    

; Supplemental Reading

    
@article{ chiu02_forced_checkpoints,
  author =	 "Chiu, J.-F.; Ge-Ming Chiu",
  title =	 "Placing forced checkpoints in distributed real-time
                  embedded systems",
  journal =	 "Computing & Control Engineering Journal ",
  volume =	 "13",
  number =	 "4",
  pages =	 "197--205",
  year =	 "2002",
  abstract =	 "An efficient scheme for placing forced checkpoints
                  in a distributed real-time embedded system so as to
                  eliminate useless checkpoints is presented. The
                  notion of primary non-causal intervals is
                  introduced; these intervals are shown to be the only
                  candidates that need to be considered for inserting
                  a minimum number of forced checkpoints. An efficient
                  algorithm is the used to identify the primary
                  non-causal intervals where forced checkpoints should
                  be inserted. The algorithm first converts the
                  original problem to another problem on a directed
                  graph, which may reflect the existence of useless
                  checkpoints. The new problem can be efficiently
                  solved using existing methods.",
  url =
                  "http://www.ece.cmu.edu/~ece749/papers/chiu02_forced_checkpoints.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "medium-low rankings in 2005",
}


@article{Elnozahy02,
  author =	 "Elnozahy, E.N. ; Alvisi, L. ; Yi-Min Wang ; Johnson,
                  D.B.",
  title =	 "A survey of rollback-recovery protocols in
                  message-passing systems",
  journal =	 "ACM Computing Surveys 34,",
  year =	 "2002",
  pages =	 "375-408",
  number =	 "3",
  abstract =	 "This survey covers rollback-recovery techniques that
                  do not require special language constructs. In the
                  first part of the survey we classify
                  rollback-recovery protocols into checkpoint-based
                  and log-based. Checkpoint-based protocols rely
                  solely on checkpointing for system state
                  restoration. Checkpointing can be coordinated,
                  uncoordinated, or communication-induced. Log-based
                  protocols combine checkpointing with logging of
                  nondeterministic events, encoded in tuples called
                  determinants. Depending on how determinants are
                  logged, log-based protocols can be pessimistic,
                  optimistic, or causal. Throughout the survey, we
                  highlight the research issues that are at the core
                  of rollback-recovery and present the solutions that
                  currently address them. We also compare the
                  performance of different rollback-recovery protocols
                  with respect to a series of desirable properties and
                  discuss the issues that arise in the practical
                  implementations of these protocols",
  url =		 "http://citeseer.nj.nec.com/elnozahy96survey.html",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}

@article{Pradhan94,
  author =	 "Pradhan, D.K. ; Vaidya, N.H.",
  title =	 "Roll-forward checkpointing scheme: a novel
                  fault-tolerant architecture",
  journal =	 "IEEE Transactions on Computers 43,",
  year =	 "1994",
  pages =	 "1163-74",
  number =	 "10",
  abstract =	 "We propose a novel architecture for a fault-tolerant
                  multiprocessor environment. It is assumed that the
                  multiprocessor organization consists of a pool of
                  active processing modules and either a small number
                  of spare modules or active modules with some spare
                  processing capacity. A fault-tolerance scheme is
                  developed for duplex systems using checkpoints. Our
                  scheme, unlike traditional checkpointing schemes,
                  requires no rollbacks for recovering from single
                  faults. The objective is to achieve performance of a
                  triple modular redundant system using duplex system
                  redundancy",
  url =
                  "http://ieeexplore.ieee.org/iel1/12/7726/00324542.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}

@Conference{Pradhan94,
  author =	 "Pradhan, D.K. ; Vaidya, N.H. ",
  title =	 "Roll-forward and rollback recovery:
                  performance-reliability trade-off",
  inbook =	 "Digest of Papers. The Twenty-Fourth International
                  Symposium on Fault-Tolerant Computing ",
  year =	 "1994",
  pages =	 "186-95",
  abstract =	 "Performance and reliability achieved by a modular
                  redundant system depend on the recovery scheme
                  used. Typically, gain in performance using
                  comparable resources results in reduced
                  reliability. Several high performance computers are
                  noted for small mean time to failure. Performance is
                  measured here in terms of mean and variance of the
                  task completion time, reliability being a task-based
                  measure defined as the probability that a task is
                  completed correctly. Two roll-forward schemes are
                  compared with two rollback schemes for achieving
                  recovery in duplex systems. The roll-forward schemes
                  discussed here are based on a roll-forward
                  checkpointing concept. Roll-forward recovery schemes
                  achieve significantly better performance than
                  rollback schemes by avoiding rollback in most common
                  fault scenarios. It is shown that the roll-forward
                  schemes improve performance with only a small loss
                  in reliability as compared to rollback schemes",
  url =		 "http://citeseer.nj.nec.com/98758.html",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}

@article{Krishna93,
  author =	 "Krishna, C.M. ; Singh, A.D.",
  title =	 "Reliability of checkpointed real-time systems using
                  time redundancy",
  journal =	 "IEEE Transactions on Reliability 42,",
  year =	 "1993",
  pages =	 "427-35",
  number =	 "3",
  abstract =	 "Real-time computers are often used in embedded,
                  life-critical applications where high reliability is
                  important. A common approach to making such systems
                  dependable is to vote on redundant processors
                  executing multiple copies of the same task is
                  described. The processors which make up such voted
                  systems are subjected not only to independently
                  occurring permanent and transient failure, but also
                  to correlated transients brought about by
                  electromagnetic interference from the operating
                  environment. To counteract these transients,
                  checkpointing and time redundancy are required, in
                  addition to processor redundancy. This work analyzes
                  the use of time and device redundancy in systems
                  subject to correlated failure. The tradeoffs in
                  checkpoint placement in such a system are found to
                  be considerably different from those for
                  non-redundant systems without real-time
                  constraints. The authors compare fault-tolerant
                  designs and without a rollback capability,
                  accounting for the increased hardware-failure rate
                  due to processor duplication when faults are
                  detected in hardware, and the doubled execution
                  times when detection is implemented in software",
  url =
                  "http://ieeexplore.ieee.org/iel1/24/6538/00257826.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}

@Conference{Leu89,
  author =	 "Leu, P.-J. ; Bhargava, B. ",
  title =	 "A model for concurrent checkpointing and recovery
                  using transactions",
  inbook =	 "9th International Conference on Distributed
                  Computing Systems ",
  year =	 "1989",
  pages =	 "423-30",
  abstract =	 "Concurrent checkpointing and recovery using a
                  concurrent transaction processing model which
                  consists of four types of atomic operation and five
                  types of conflict is developed. Each
                  checkpoint/rollback transaction is executed by
                  multiple processes in the system. They can be
                  executed concurrently. It is shown that the
                  consistency of recovery lines and rollback lines
                  established by checkpoint transactions and rollback
                  transactions can be achieved by enforcing
                  serializability on the corresponding
                  transactions. There are two advantages in using a
                  transaction model for concurrent checkpointing and
                  recovery: (1) it is easier to find algorithms to
                  solve a transaction processing problem; and (2)
                  based on this model, related issues of the two
                  corresponding problems can be thought of
                  uniformly. This model clarifies the concepts of
                  concurrent checkpointing and recovery, and brings
                  more ideas for designing algorithms",
  url =
                  "http://ieeexplore.ieee.org//iel2/248/1532/00037973.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}

@article{Campbell86,
  author =	 "Campbell, R.H. ; Randell, B.",
  title =	 "Error recovery in asynchronous systems",
  journal =	 "IEEE Transactions on Software Engineering SE-12,",
  year =	 "1986",
  pages =	 "811-26",
  number =	 "8",
  abstract =	 "A framework for the provision of fault tolerance in
                  asynchronous systems is introduced. The proposal
                  generalizes the form of simple recovery facilities
                  supported by nested atomic actions in which the
                  exception mechanisms only permit backward error
                  recovery. It allows the construction of systems
                  using both forward and backward error recovery and
                  thus allows the exploitation of the complementary
                  benefits of the two schemes. Backward recovery,
                  forward recovery, and normal processing activities
                  can occur concurrently within the organization
                  proposed. Exception handling is generalized to
                  provide a uniform basis for fault tolerance schemes
                  with the atomic action structure. The generalization
                  includes a resolution scheme for concurrently raised
                  exceptions based on an exception tree and an
                  abortion scheme that permits the termination of the
                  internal atomic actions. An automatic resolution
                  mechanism is outlined for exceptions in atomic
                  actions which allows users to separate their
                  recovery schemes from the details of the underlying
                  algorithms",
  url =
                  "http://www.ece.cmu.edu/~ece749/papers/campbell86_error_recovery.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}