; Hand this in to: ece749+hw@ece.cmu.edu

; Required Readings

@Conference{anderson85_swft_evaluation,
  author = "Anderson, T.; Barrett, P.A.; Halliwell, D.N.; Moulding, M.R.B.",
  title = "An Evaluation of Software Fault Tolerance in a Practical System",
  booktitle = "Fault-Tolerant Computing 1995, Highlights from Twenty-Five Years",
  organization = "FTCS",
  year = "1985",
  abstract = "An experimental project to assess the effectiveness of software fault tolerance techniques is described. Techniques were developed for, and applied to, a realistic implementation of a practical real-time system, namely a naval command and control system. Reliability data was collected by running this system with a simulated tactical environment for a variety of action scenarios. Analysis of the data confirms that software fault tolerance techniques can significantly enhance system reliability.",
  url = "http://www.ece.cmu.edu/~ece749/papers/anderson85_swft_evaluation.pdf",
  studentname = "", summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

@inbook{levendel95_telecom_cost_effectiveness,
  author = "Levendel, Y.",
  title = "Software Fault Tolerance",
  editor = "Lyu",
  organization = "University of Virginia, VA, USA",
  year = "1995",
  chapter = "2",
  pages = "279--314",
  abstract = "In switching software applications, service quality has traditionally been achieved by the combination of two strategies: high reliance on defect elimination and fault recovery/tolerance. In turn, defect elimination has proven to be costly in staffing, and the implementation of fault tolerance has resulted in high hardware costs, by exclusively relying on proprietary hardware and by using monolithic recovery techniques external to the applications to achieve high quality service. The driving forces for this strategy were: no unscheduled downtime, deferred maintenance, easy restart when needed, and easy growth and de-growth. While these objectives are still attractive, they can today be achieved in a more cost-effective way by increased reliance on standard software fault recovery components distributed closer to and inside the applications, and by using hardware sparing recovery at the system level. A recent trend toward rapid software customization will limit traditional software recovery techniques to where they are absolutely necessary to satisfy performance requirements.",
  url = "http://www.ece.cmu.edu/~ece749/papers/levendel95_telecom_cost_effectiveness.pdf",
  studentname = "", summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

; Pick one of the following

@Conference{wilken90_control_signatures,
  author = "Shen, J.P.; Wilken, K.",
  title = "Continuous signature monitoring: efficient concurrent detection of processor control errors",
  booktitle = "New Frontiers in Testing, International Test Conference",
  year = "1988",
  pages = "914--925",
  abstract = "This paper presents a low-cost approach to concurrent detection of processor control errors that uses a simple hardware monitor and signatures embedded into the executing program. Existing signature-monitoring techniques detect a large portion of processor control errors at a fraction of the cost of duplication. Analytical methods developed in this paper show that the new approach, continuous signature monitoring (CSM), makes major advances beyond existing techniques. CSM reduces the fraction of undetected control-flow errors by orders of magnitude, to less than 10^-6. The number of signatures reaches a theoretical minimum, lowered by as much as 3 times to a range of 4-11%. Signature cost is reduced by placing CSM signatures at locations that minimize performance loss and (for some architectures) memory overhead. CSM exploits the program memory's SEC/DED code to decrease error-detection latency by as much as 1000 times, to 0.016 program memory cycles, without increasing memory overhead. This short latency allows transient faults to be tolerated.",
  url = "http://www.ece.cmu.edu/~ece749/papers/wilken90_control_signatures.pdf",
  studentname = "", summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
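; Note (study aid, not part of the assigned entry): the CSM paper above describes a hardware
; monitor that folds the executing instruction stream into a run-time signature and compares it
; against reference signatures embedded in the program, flagging a mismatch as a control error.
; The Python sketch below only mimics that idea in software, assuming a toy program encoded as
; basic blocks of 32-bit words; the encoding and all names are illustrative, not from the paper.

import zlib

def block_signature(instr_words):
    # Reference signature for one basic block (stand-in for the embedded CSM signature word).
    sig = 0
    for w in instr_words:
        sig = zlib.crc32(w.to_bytes(4, "little"), sig)
    return sig

# "Compiled" toy program: basic blocks of instruction words plus their embedded signatures.
program = [{"instrs": [0x12340001, 0x12340002, 0x12340003]},
           {"instrs": [0x55660004, 0x55660005]}]
for block in program:
    block["signature"] = block_signature(block["instrs"])

def run_with_monitor(program, corrupt_block=None):
    # The monitor recomputes each block's signature from the fetched words and checks it
    # against the embedded reference; a disagreement signals a processor control error.
    for i, block in enumerate(program):
        fetched = list(block["instrs"])
        if corrupt_block == i:
            fetched[0] ^= 0x00000100      # inject a single-bit corruption into this block
        if block_signature(fetched) != block["signature"]:
            return "control error detected in block %d" % i
    return "no error detected"

print(run_with_monitor(program))                    # no error detected
print(run_with_monitor(program, corrupt_block=1))   # control error detected in block 1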
@Conference{garg98_sw_aging,
  author = "S. Garg, A. van Moorsel, K. Vaidyanathan and K. S. Trivedi",
  title = "A Methodology for Detection and Estimation of Software Aging",
  booktitle = "Int'l. Symp. on Software Reliability Engineering",
  year = "1998",
  abstract = "The phenomenon of software aging refers to the accumulation of errors during the execution of the software which eventually results in its crash/hang failure. A gradual performance degradation may also accompany software aging. Proactive fault management techniques such as 'software rejuvenation' may be used to counteract aging if it exists. In this paper, we propose a methodology for detection and estimation of aging in the UNIX operating system...",
  url = "http://www.ece.cmu.edu/~ece749/papers/garg98_sw_aging.pdf",
  studentname = "", summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

; Supplemental Reading

@article{Sullivan95,
  author = "Sullivan, G.F.; Wilson, D.S.; Masson, G.M.",
  title = "Certification of computational results",
  journal = "IEEE Transactions on Computers",
  volume = "44",
  number = "7",
  year = "1995",
  pages = "833-47",
  abstract = "We describe a conceptually novel and powerful technique to achieve fault detection and fault tolerance in hardware and software systems. When used for software fault detection, this new technique uses time and software redundancy and can be outlined as follows. In the initial phase, a program is run to solve a problem and store the result. In addition, this program leaves behind a trail of data which we call a certification trail. In the second phase, another program is run which solves the original problem again. This program, however, has access to the certification trail left by the first program. Because of the availability of the certification trail, the second phase can be performed by a less complex program and can execute more quickly. In the final phase, the two results are compared and if they agree the results are accepted as correct; otherwise an error is indicated. An essential aspect of this approach is that the second program must always generate either an error indication or a correct output even when the certification trail it receives from the first program is incorrect. We formalize the certification trail approach to fault tolerance and illustrate realizations of it by considering algorithms for the following problems: convex hull, sorting, and shortest path. We compare the certification trail approach to other approaches to fault tolerance.",
  url = "http://ieeexplore.ieee.org/iel1/12/8936/00392843.pdf",
  studentname = "", summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
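; Note (study aid): the Sullivan95 abstract above outlines the certification-trail technique in
; three phases, and names sorting as one of its example problems. The Python sketch below
; illustrates that example under our own assumptions (function names and the permutation-trail
; format are ours, not the paper's): phase 1 solves the problem and emits a trail, phase 2
; rebuilds the answer cheaply from the trail but must reject any trail that does not yield a
; correct result, and phase 3 compares the two outputs.

def phase1_sort_with_trail(data):
    # Full-complexity solver: sort and record the permutation as the certification trail.
    trail = sorted(range(len(data)), key=lambda i: data[i])
    return [data[i] for i in trail], trail

def phase2_check_with_trail(data, trail):
    # Simpler, linear-time second program: it must emit either a correct result or an error,
    # even if the trail it was handed is corrupted.
    seen = [False] * len(data)
    for i in trail:
        if not (0 <= i < len(data)) or seen[i]:
            raise ValueError("error indication: trail is not a permutation")
        seen[i] = True
    if not all(seen):
        raise ValueError("error indication: trail is not a permutation")
    result = [data[i] for i in trail]
    for a, b in zip(result, result[1:]):
        if a > b:
            raise ValueError("error indication: trail does not sort the input")
    return result

data = [5, 3, 9, 1, 4]
first, trail = phase1_sort_with_trail(data)
second = phase2_check_with_trail(data, trail)
print("results agree" if first == second else "error indicated")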
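; Note (study aid): the garg98_sw_aging entry above proposes detecting and estimating aging by
; monitoring operating-system resource data over time and applying statistical trend analysis.
; The sketch below is a deliberately simplified stand-in: it fits a least-squares slope to
; made-up free-memory samples and extrapolates a time to exhaustion. The sample data, the metric
; choice, and the plain linear fit are our assumptions, not the paper's method.

def least_squares_slope(times, values):
    # Ordinary least-squares slope of values against times.
    n = len(times)
    mean_t = sum(times) / n
    mean_v = sum(values) / n
    num = sum((t - mean_t) * (v - mean_v) for t, v in zip(times, values))
    den = sum((t - mean_t) ** 2 for t in times)
    return num / den

hours = list(range(10))                                       # hypothetical hourly samples
free_mb = [512, 500, 493, 480, 470, 462, 450, 441, 430, 422]  # hypothetical free memory (MB)

slope = least_squares_slope(hours, free_mb)                   # MB per hour; negative suggests aging
if slope < 0:
    print("downward trend: %.1f MB/hour, roughly %.0f hours to exhaustion"
          % (slope, free_mb[-1] / -slope))
else:
    print("no downward trend detected")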
@Conference{Arlat88,
  author = "Arlat, J.; Kanoun, K.; Laprie, J.-C.",
  title = "Dependability evaluation of software fault-tolerance",
  booktitle = "Eighteenth International Symposium on Fault-Tolerant Computing. Digest of Papers. FTCS-18",
  year = "1988",
  pages = "142-77",
  abstract = "The authors present a detailed reliability and safety analysis of the two major software fault-tolerance approaches, recovery blocks (RB) and n-version programming (NVP). The methodology used for the modeling is based on the identification of the possible types of faults introduced during the specification and the implementation, and on the analysis of the behavior following fault activation. The main outcome of the evaluation concerns the derivation of analytical results for identifying the improvement that can result from the use of RB and NVP and for revealing the most critical types of related faults. The study of nested RBs shows that the proposed analysis approach can be applied to such realistic software structures, and when an alternate is itself an RB, the results are analogous to the case of the addition of a third alternate. The reliability analysis showed that an improvement has to be expected, but that this improvement would be very low. The study of the discarding of a failed version in NVP shows that this strategy is always worthwhile for safety, whereas, for reliability, it is only worthwhile when independent faults dominate.",
  url = "http://ieeexplore.ieee.org//iel3/3846/11214/00532634.pdf",
  studentname = "", summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

@Conference{Wang93,
  author = "Wang, Y.-M.; Huang, Y.; Fuchs, W.K.",
  title = "Progressive retry for software error recovery in distributed systems",
  booktitle = "Digest of Papers FTCS-23 The Twenty-Third International Symposium on Fault-Tolerant Computing",
  year = "1993",
  pages = "138-44",
  abstract = "A method of execution retry for bypassing software faults based on checkpointing, rollback, message reordering, and replaying is described. The authors demonstrate how rollback techniques, previously developed for transient hardware failure recovery, can also be used to recover from software errors by exploiting message reordering to bypass software faults. The approach intentionally increases the degree of nondeterminism and the scope of rollback when a previous retry fails. Examples from experience with telecommunications software systems illustrate the benefits of the scheme.",
  url = "http://ieeexplore.ieee.org/iel3/4964/13650/00627317.pdf",
  studentname = "", summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
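; Note (study aid): the Wang93 abstract above describes retrying from a checkpoint while
; deliberately increasing nondeterminism, for example by replaying logged messages in a
; different order so an order-dependent software fault is bypassed. The sketch below only
; illustrates that escalation under our own assumptions: the faulty handler, the message log,
; and the two retry steps (replay as logged, then replay reordered) are all made up.

import itertools

def process(messages):
    # Hypothetical order-sensitive handler: fails if "release" arrives before "acquire".
    state = []
    for m in messages:
        if m == "release" and "acquire" not in state:
            raise RuntimeError("boundary fault triggered by message order")
        state.append(m)
    return state

def progressive_retry(checkpointed_log):
    # Step 1: deterministic replay of the logged messages from the checkpoint.
    # Step 2: escalate nondeterminism by reordering the logged messages and replaying.
    attempts = [list(checkpointed_log)]
    attempts += [list(p) for p in itertools.permutations(checkpointed_log)]
    for attempt_number, attempt in enumerate(attempts, start=1):
        try:
            return attempt_number, process(attempt)
        except RuntimeError:
            continue                      # roll back to the checkpoint and try the next level
    raise RuntimeError("all retry levels exhausted; widen the rollback scope")

log = ["release", "acquire", "work"]
attempt_number, state = progressive_retry(log)
print("recovered on attempt %d with state %s" % (attempt_number, state))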
@article{Taylor82,
  author = "Taylor, D.J.; Black, J.P.",
  title = "Principles of data structure error correction",
  journal = "IEEE Transactions on Computers",
  volume = "C-31",
  number = "7",
  year = "1982",
  pages = "602-8",
  abstract = "Error correction in robust data structures is a difficult problem. Several algorithms for correcting structural errors, in certain list and tree structures, are now known. These algorithms have been examined to determine common design features which may prove useful in the design of correction algorithms for other structures. This paper presents a summary of the algorithms studied and the design principles which were derived. The paper is not a `cookbook' for constructing error correction algorithms but should prove useful to those designing such algorithms. Implications for the design of robust data structures, so that correction may be done easily, are also briefly discussed.",
  url = "http://www.ece.cmu.edu/~ece749/papers/taylor82_data_structure_ec.pdf",
  studentname = "", summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
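; Note (study aid): Taylor82 above surveys correction algorithms that exploit structural
; redundancy such as doubled links and counts. The sketch below shows one such redundancy under
; our own assumptions (class and field names are ours): in a doubly linked list a corrupted
; forward pointer is detectable because the reciprocal back pointer disagrees, and it can be
; repaired by locating the unique node whose back pointer still names the damaged node.

class Node:
    def __init__(self, value):
        self.value = value
        self.next = None      # forward link
        self.prev = None      # redundant backward link

def build_list(values):
    nodes = [Node(v) for v in values]
    for a, b in zip(nodes, nodes[1:]):
        a.next, b.prev = b, a
    return nodes

def correct_forward_pointers(nodes):
    # Assumes back pointers are intact; repairs any single damaged forward pointer.
    corrections = 0
    for node in nodes:
        if node.next is not None and node.next.prev is not node:
            successor = None
            for candidate in nodes:
                if candidate.prev is node:
                    successor = candidate
                    break
            node.next = successor
            corrections += 1
    return corrections

nodes = build_list(["a", "b", "c", "d"])
nodes[1].next = nodes[3]                       # inject a structural error: b skips over c
print("corrected %d forward pointer(s)" % correct_forward_pointers(nodes))
chain, head = [], nodes[0]
while head is not None:
    chain.append(head.value)
    head = head.next
print(chain)                                   # ['a', 'b', 'c', 'd'] after correction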