; Hand this in to: ece849-staff+hw@ece.cmu.edu

; Required Readings

@Conference{anderson85_swft_evaluation,
  author = "Anderson, T. and Barrett, P.A. and Halliwell, D.N. and Moulding, M.R.",
  title = "An Evaluation of Software Fault Tolerance in a Practical System",
  booktitle = "Fault-Tolerant Computing 1995, Highlights from Twenty-Five Years",
  organization = "FTCS",
  year = "1985",
  abstract = "An experimental project to assess the effectiveness of software fault tolerance techniques is described. Techniques were developed for, and applied to, a realistic implementation of a practical real-time system, namely a naval command and control system. Reliability data was collected by running this system with a simulated tactical environment for a variety of action scenarios. Analysis of the data confirms that software fault tolerance techniques can significantly enhance system reliability.",
  url = "http://ieeexplore-beta.ieee.org//iel3/3846/11214/00532624.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

@incollection{levendel95_telecom_cost_effectiveness,
  author = "Levendel, Y.",
  title = "The Cost Effectiveness of Telecommunication Service Dependability",
  booktitle = "Software Fault Tolerance",
  editor = "Lyu, Michael R.",
  publisher = "John Wiley \& Sons",
  year = "1995",
  chapter = "2",
  pages = "279--314",
  abstract = "In switching software applications, service quality has traditionally been achieved by the combination of two strategies: high reliance on defect elimination and fault recovery/tolerance. In turn, defect elimination has proven to be costly in staffing, and the implementation of fault tolerance has resulted in high hardware costs, by exclusively relying on proprietary hardware and by using monolithic recovery techniques external to the applications to achieve high quality service. The driving forces for this strategy were: no unscheduled downtime, deferred maintenance, easy restart when needed, and easy growth and de-growth. While these objectives are still attractive, they can today be achieved in a more cost-effective way by increased reliance on standard software fault recovery components distributed closer to and inside the applications, and by using hardware sparing recovery at the system level. A recent trend toward rapid software customization will limit traditional software recovery techniques to where they are absolutely necessary to satisfy performance requirements.",
  url = "http://www.ece.cmu.edu/~ece849/papers/levendel95_telecom_cost_effectiveness.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

; Pick one of the following

@Article{wilken90_control_signatures,
  author = "Kent Wilken and John Paul Shen",
  title = "Continuous signature monitoring: efficient concurrent detection of processor control errors",
  journal = "IEEE Transactions on Computer-Aided Design",
  year = "1990",
  volume = "9",
  number = "6",
  pages = "629--641",
  abstract = "This paper presents a low-cost approach to concurrent detection of processor control errors that uses a simple hardware monitor and signatures embedded into the executing program. Existing signature-monitoring techniques detect a large portion of processor control errors at a fraction of the cost of duplication. Analytical methods developed in this paper show that the new approach, continuous signature monitoring (CSM), makes major advances beyond existing techniques. CSM reduces the fraction of undetected control-flow errors by orders of magnitude, to less than 10^-6. The number of signatures reaches a theoretical minimum, lowered by as much as 3 times, to a range of 4--11\%. Signature cost is reduced by placing CSM signatures at locations that minimize performance loss and (for some architectures) memory overhead. CSM exploits the program memory's SEC/DED code to decrease error-detection latency by as much as 1000 times, to 0.016 program memory cycles, without increasing memory overhead. This short latency allows transient faults to be tolerated.",
  url = "http://ieeexplore.ieee.org/iel1/43/1994/00055193.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
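; The entry above describes embedded-signature control-flow checking. Below is a
; minimal software model of the general idea in Python: each basic block carries a
; reference signature, and a monitor folds executed instruction words into a running
; signature that must match at the block's exit. The block table, opcode words, and
; the rotate-XOR fold are all invented for illustration; this is not Wilken and
; Shen's actual hardware monitor or signature encoding.

def fold(sig, word):
    """Fold one 16-bit instruction word into the running signature (rotate-XOR)."""
    rot = ((sig << 1) | (sig >> 15)) & 0xFFFF
    return rot ^ word

def reference_signature(words):
    """The signature a compiler would embed at the block's exit (hypothetical)."""
    sig = 0
    for w in words:
        sig = fold(sig, w)
    return sig

# Hypothetical basic blocks: instruction words plus the embedded reference signature.
BLOCKS = {name: (ws, reference_signature(ws))
          for name, ws in {"B0": [0x1A2B, 0x3C4D],
                           "B1": [0x5E6F, 0x7081, 0x92A3]}.items()}

def run_block(name, flip=0):
    """Recompute the signature as words are 'fetched'; flip injects a bit error."""
    words, ref = BLOCKS[name]
    sig = 0
    for w in words:
        sig = fold(sig, w ^ flip)
    if sig != ref:
        raise RuntimeError(f"control error detected in {name}")

run_block("B0")                    # clean run: signatures match, no alarm
try:
    run_block("B1", flip=0x0400)   # single bit error in the fetched stream
except RuntimeError as e:
    print(e)                       # the monitor flags the mismatch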
@Article{vaidyanathan05_software_rejuvenation,
  author = "Vaidyanathan, K. and Trivedi, K.S.",
  title = "A comprehensive model for software rejuvenation",
  journal = "IEEE Transactions on Dependable and Secure Computing",
  year = "2005",
  volume = "2",
  number = "2",
  pages = "124--137",
  abstract = "Recently, the phenomenon of software aging, one in which the state of the software system degrades with time, has been reported. This phenomenon, which may eventually lead to system performance degradation and/or crash/hang failure, is the result of exhaustion of operating system resources, data corruption, and numerical error accumulation. To counteract software aging, a technique called software rejuvenation has been proposed, which essentially involves occasionally terminating an application or a system, cleaning its internal state and/or its environment, and restarting it. Since rejuvenation incurs an overhead, an important research issue is to determine optimal times to initiate this action. In this paper, we first describe how to include faults attributed to software aging in the framework of Gray's software fault classification (deterministic and transient), and study the treatment and recovery strategies for each of the fault classes. We then construct a semi-Markov reward model based on workload and resource usage data collected from the UNIX operating system. We identify different workload states using statistical cluster analysis, and estimate transition probabilities and sojourn time distributions from the data. Corresponding to each resource, a reward function is then defined for the model based on the rate of resource depletion in each state. The model is then solved to obtain estimated times to exhaustion for each resource. The results from the semi-Markov reward model are then fed into a higher-level availability model that accounts for failure followed by reactive recovery, as well as proactive recovery. This comprehensive model is then used to derive optimal rejuvenation schedules that maximize availability or minimize downtime cost.",
  url = "http://ieeexplore.ieee.org/iel5/8858/31216/01453531.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
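; The rejuvenation paper above derives optimal schedules from a measured
; semi-Markov reward model. The Python sketch below shows only the core
; trade-off in a much simpler renewal-style approximation: rejuvenate too often
; and you pay the restart cost constantly; too rarely and the aging failure
; probability dominates. The Weibull aging distribution, the cost numbers, and
; the search grid are all assumed for illustration, not taken from the paper.

import math

SHAPE, SCALE = 2.5, 1000.0   # assumed Weibull aging: hazard grows with uptime (hours)
C_FAIL, C_REJUV = 4.0, 0.25  # assumed downtime: hours per crash vs. per rejuvenation

def failure_prob(t):
    """P(aging-induced failure before uptime t) under the assumed Weibull model."""
    return 1.0 - math.exp(-((t / SCALE) ** SHAPE))

def downtime_rate(T):
    """Approximate expected downtime per hour when rejuvenating every T hours."""
    return (failure_prob(T) * C_FAIL + C_REJUV) / T

best = min(range(50, 2001, 10), key=downtime_rate)
print(f"rejuvenate every ~{best} h; downtime rate {downtime_rate(best):.5f}")

; The minimum of this curve is the same tension the paper's availability model
; resolves, except there the transition rates come from measured workload data.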
; Supplemental Reading

@inproceedings{littlewood00_sw_dependability_roadmap,
  author = "Bev Littlewood and Lorenzo Strigini",
  title = "Software reliability and dependability: a roadmap",
  booktitle = "ICSE '00: Proceedings of the Conference on The Future of Software Engineering",
  year = "2000",
  isbn = "1-58113-253-0",
  pages = "175--188",
  location = "Limerick, Ireland",
  doi = "10.1145/336512.336551",
  publisher = "ACM Press",
  address = "New York, NY, USA",
  abstract = "Software's increasing role creates both requirements for being able to trust it more than before, and for more people to know how much they can trust their software. A sound engineering approach requires both techniques for producing reliability and sound assessment of the achieved results. Different parts of industry and society face different challenges: the need for education and cultural changes in some areas, the adaptation of known scientific results to practical use in others, and in others still the need to confront inherently hard problems of prediction and decision-making, both to clarify the limits of current understanding and to push them back. We outline the specific difficulties in applying a sound engineering approach to software reliability engineering, some of the current trends and problems, and a set of issues that we therefore see as important in an agenda for research in software dependability.",
  url = "http://portal.acm.org/citation.cfm?doid=336512.336551",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

@article{Sullivan95,
  author = "Sullivan, G.F. and Wilson, D.S. and Masson, G.M.",
  title = "Certification of computational results",
  journal = "IEEE Transactions on Computers",
  year = "1995",
  volume = "44",
  number = "7",
  pages = "833--847",
  abstract = "We describe a conceptually novel and powerful technique to achieve fault detection and fault tolerance in hardware and software systems. When used for software fault detection, this new technique uses time and software redundancy and can be outlined as follows. In the initial phase, a program is run to solve a problem and store the result. In addition, this program leaves behind a trail of data which we call a certification trail. In the second phase, another program is run which solves the original problem again. This program, however, has access to the certification trail left by the first program. Because of the availability of the certification trail, the second phase can be performed by a less complex program and can execute more quickly. In the final phase, the two results are compared and if they agree, the results are accepted as correct; otherwise an error is indicated. An essential aspect of this approach is that the second program must always generate either an error indication or a correct output even when the certification trail it receives from the first program is incorrect. We formalize the certification trail approach to fault tolerance and illustrate realizations of it by considering algorithms for the following problems: convex hull, sorting, and shortest path. We compare the certification trail approach to other approaches to fault tolerance.",
  url = "http://ieeexplore.ieee.org/iel1/12/8936/00392843.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
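; Sullivan et al.'s certification-trail idea, sketched below in Python for
; sorting (one of the paper's example problems). Using the sorting permutation
; as the trail is one natural choice, not necessarily the paper's exact
; construction; note the checker must reject corrupted trails as well as
; incorrect results, per the requirement quoted in the abstract.

def phase1_sort(xs):
    """First phase: solve the problem and leave a trail (the sorting permutation)."""
    trail = sorted(range(len(xs)), key=lambda i: xs[i])
    return [xs[i] for i in trail], trail

def phase2_check(xs, result, trail):
    """Second phase: linear-time re-solution using the trail. Returns False
    (an error indication) for a wrong result OR a corrupted trail."""
    if len(trail) != len(xs) or len(result) != len(xs):
        return False
    seen = [False] * len(xs)
    for i in trail:                      # trail must be a valid permutation
        if not (0 <= i < len(xs)) or seen[i]:
            return False
        seen[i] = True
    redo = [xs[i] for i in trail]        # reproduce the output through the trail
    return redo == result and all(
        redo[k] <= redo[k + 1] for k in range(len(redo) - 1))

data = [5, 3, 8, 1]
out, trail = phase1_sort(data)
assert phase2_check(data, out, trail)               # results agree: accept
assert not phase2_check(data, out, [0, 1, 2, 2])    # corrupted trail: error indicated

; Given the trail, the second phase runs in linear time rather than sorting
; again, which is the source of the speedup the abstract describes.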
@Conference{garg98_sw_aging,
  author = "S. Garg and A. van Moorsel and K. Vaidyanathan and K. S. Trivedi",
  title = "A Methodology for Detection and Estimation of Software Aging",
  booktitle = "International Symposium on Software Reliability Engineering",
  year = "1998",
  abstract = "The phenomenon of software aging refers to the accumulation of errors during the execution of the software, which eventually results in its crash/hang failure. A gradual performance degradation may also accompany software aging. Proactive fault management techniques such as software rejuvenation may be used to counteract aging if it exists. In this paper, we propose a methodology for detection and estimation of aging in the UNIX operating system...",
  url = "http://ieeexplore.ieee.org/iel4/5926/15784/00730892.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

@Conference{Arlat88,
  author = "Arlat, J. and Kanoun, K. and Laprie, J.-C.",
  title = "Dependability evaluation of software fault-tolerance",
  booktitle = "Eighteenth International Symposium on Fault-Tolerant Computing (FTCS-18), Digest of Papers",
  year = "1988",
  pages = "142--177",
  abstract = "The authors present a detailed reliability and safety analysis of the two major software fault-tolerance approaches, recovery blocks (RB) and N-version programming (NVP). The methodology used for the modeling is based on the identification of the possible types of faults introduced during the specification and the implementation, and on the analysis of the behavior following fault activation. The main outcome of the evaluation concerns the derivation of analytical results for identifying the improvement that can result from the use of RB and NVP and for revealing the most critical types of related faults. The study of nested RBs shows that the proposed analysis approach can be applied to such realistic software structures; when an alternate is itself an RB, the results are analogous to the case of the addition of a third alternate. The reliability analysis showed that an improvement can be expected, but that this improvement would be very low. The study of the discarding of a failed version in NVP shows that this strategy is always worthwhile for safety, whereas, for reliability, it is only worthwhile when independent faults dominate.",
  url = "http://ieeexplore.ieee.org//iel3/3846/11214/00532634.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
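; Arlat et al. model recovery blocks (RB) and N-version programming. For
; reference, the RB control structure they analyze looks like this in outline
; (Python sketch; the alternates and the acceptance test are toy stand-ins):

def recovery_block(state, alternates, acceptance_test):
    """Run alternates in order from a checkpoint until one passes the test."""
    checkpoint = dict(state)                   # establish the recovery point
    for alt in alternates:
        try:
            candidate = alt(dict(checkpoint))  # each retry restarts from the checkpoint
            if acceptance_test(candidate):
                return candidate               # acceptance test passed
        except Exception:
            pass                               # an exception counts as a failed alternate
    raise RuntimeError("all alternates rejected: RB signals failure")

# Toy job: compute y = x*x. The primary alternate has a seeded bug at x == 3.
primary = lambda s: {**s, "y": s["x"] * s["x"] + (1 if s["x"] == 3 else 0)}
backup  = lambda s: {**s, "y": s["x"] ** 2}
accept  = lambda s: s.get("y") == s["x"] * s["x"]

print(recovery_block({"x": 3}, [primary, backup], accept))  # {'x': 3, 'y': 9}

; Here the acceptance test is a perfect oracle, so the demo always recovers;
; the paper's analysis hinges on exactly the realistic case where it is not,
; and on related faults shared between alternates.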
", title = "Progressive retry for software error recovery in distributed systems", inbook = "Digest of Papers FTCS-23 The Twenty-Third International Symposium on Fault-Tolerant Computing", year = "1993", pages = "138-44", abstract = "A method of execution retry for bypassing software faults based on checkpointing, rollback, message reordering, and replaying is described. The authors demonstrate how rollback techniques, previously developed for transient hardware failure recovery, can also be used to recover from software errors by exploiting message reordering to bypass software faults. The approach intentionally increases the degree of nondeterminism and the scope of rollback when a previous retry fails. Examples from experience with telecommunications software systems illustrate the benefits of the scheme", url = "http://ieeexplore.ieee.org/iel3/4964/13650/00627317.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Taylor82, author = "Taylor, D.J. ; Black, J.P.", title = "Principles of data structure error correction", journal = "IEEE Transactions on Computers C-31,", year = "1982", pages = "602-8", number = "7", abstract = "Error correction in robust data structures is a difficult problem. Several algorithms for correcting structural errors, in certain list and tree structures, are now known. These algorithms have been examined to determine common design features which may prove useful in the design of correction algorithms for other structures. This paper presents a summary of the algorithms studied and the design principles which were derived. The paper is not a `cookbook' for constructing error correction algorithms but should prove useful to those designing such algorithms. Implications for the design of robust data structures, so that correction may be done easily, are also briefly discussed", url = "http://www.ece.cmu.edu/~ece749/papers/taylor82_data_structure_ec.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", }