; Hand this in to: ece849-staff+hw@ece.cmu.edu

; Required Readings

@Conference{anderson85_swft_evaluation,
  author = "Anderson, T. and Barrett, P.A. and Halliwell, D.N. and Moulding, M.R.",
  title = "An Evaluation of Software Fault Tolerance in a Practical System",
  booktitle = "Fault-Tolerant Computing 1995, Highlights from Twenty-Five Years",
  organization = "FTCS",
  year = "1985",
  abstract = "An experimental project to assess the effectiveness of software fault tolerance techniques is described. Techniques were developed for, and applied to, a realistic implementation of a practical real-time system, namely a naval command and control system. Reliability data was collected by running this system with a simulated tactical environment for a variety of action scenarios. Analysis of the data confirms that software fault tolerance techniques can significantly enhance system reliability.",
  url = "http://ieeexplore-beta.ieee.org//iel3/3846/11214/00532624.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

@incollection{levendel95_telecom_cost_effectiveness,
  author = "Levendel, Y.",
  title = "The Cost Effectiveness of Telecommunication Service Dependability",
  booktitle = "Software Fault Tolerance",
  editor = "Lyu, Michael R.",
  publisher = "John Wiley \& Sons",
  year = "1995",
  chapter = "2",
  pages = "279--314",
  abstract = "In switching software applications, service quality has traditionally been achieved by the combination of two strategies: high reliance on defect elimination and fault recovery/tolerance. In turn, defect elimination has proven to be costly in staffing, and the implementation of fault tolerance has resulted in high hardware costs, by exclusively relying on proprietary hardware and by using monolithic recovery techniques external to the applications to achieve high quality service. The driving forces for this strategy were: no unscheduled downtime, deferred maintenance, easy restart when needed, and easy growth and de-growth. While these objectives are still attractive, they can today be achieved in a more cost-effective way by increased reliance on standard software fault recovery components distributed closer to and inside the applications, and by using hardware sparing recovery at the system level. A recent trend toward rapid software customization will limit traditional software recovery techniques to where they are absolutely necessary to satisfy performance requirements.",
  url = "http://www.ece.cmu.edu/~ece849/papers/levendel95_telecom_cost_effectiveness.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

; Pick one of the following

@Article{wilken90_control_signatures,
  author = "Kent Wilken and John Paul Shen",
  title = "Continuous signature monitoring: efficient concurrent detection of processor control errors",
  journal = "IEEE Transactions on Computer-Aided Design",
  year = "1990",
  volume = "9",
  number = "6",
  pages = "629--641",
  abstract = "This paper presents a low-cost approach to concurrent detection of processor control errors that uses a simple hardware monitor and signatures embedded into the executing program. Existing signature-monitoring techniques detect a large portion of processor control errors at a fraction of the cost of duplication. Analytical methods developed in this paper show that the new approach, continuous signature monitoring (CSM), makes major advances beyond existing techniques. CSM reduces the fraction of undetected control-flow errors by orders of magnitude, to less than 10^-6. The number of signatures reaches a theoretical minimum, lowered by as much as 3 times, to a range of 4--11\%. Signature cost is reduced by placing CSM signatures at locations that minimize performance loss and (for some architectures) memory overhead. CSM exploits the program memory's SEC/DED code to decrease error-detection latency by as much as 1000 times, to 0.016 program memory cycles, without increasing memory overhead. This short latency allows transient faults to be tolerated.",
  url = "http://ieeexplore.ieee.org/iel1/43/1994/00055193.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
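; The entry above describes embedded-signature control-flow checking. Below is a
; minimal software model of the general idea in Python: each basic block carries a
; reference signature, and a monitor folds executed instruction words into a running
; signature that must match at the block's exit. The block table, opcode words, and
; the rotate-XOR fold are all invented for illustration; this is not Wilken and
; Shen's actual hardware monitor or signature encoding.

def fold(sig, word):
    """Fold one 16-bit instruction word into the running signature (rotate-XOR)."""
    rot = ((sig << 1) | (sig >> 15)) & 0xFFFF
    return rot ^ word

def reference_signature(words):
    """The signature a compiler would embed at the block's exit (hypothetical)."""
    sig = 0
    for w in words:
        sig = fold(sig, w)
    return sig

# Hypothetical basic blocks: instruction words plus the embedded reference signature.
BLOCKS = {name: (ws, reference_signature(ws))
          for name, ws in {"B0": [0x1A2B, 0x3C4D],
                           "B1": [0x5E6F, 0x7081, 0x92A3]}.items()}

def run_block(name, flip=0):
    """Recompute the signature as words are 'fetched'; flip injects a bit error."""
    words, ref = BLOCKS[name]
    sig = 0
    for w in words:
        sig = fold(sig, w ^ flip)
    if sig != ref:
        raise RuntimeError(f"control error detected in {name}")

run_block("B0")                    # clean run: signatures match, no alarm
try:
    run_block("B1", flip=0x0400)   # single bit error in the fetched stream
except RuntimeError as e:
    print(e)                       # the monitor flags the mismatch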
@Article{vaidyanathan05_software_rejuvenation,
  author = "Vaidyanathan, K. and Trivedi, K.S.",
  title = "A comprehensive model for software rejuvenation",
  journal = "IEEE Transactions on Dependable and Secure Computing",
  year = "2005",
  volume = "2",
  number = "2",
  pages = "124--137",
  abstract = "Recently, the phenomenon of software aging, one in which the state of the software system degrades with time, has been reported. This phenomenon, which may eventually lead to system performance degradation and/or crash/hang failure, is the result of exhaustion of operating system resources, data corruption, and numerical error accumulation. To counteract software aging, a technique called software rejuvenation has been proposed, which essentially involves occasionally terminating an application or a system, cleaning its internal state and/or its environment, and restarting it. Since rejuvenation incurs an overhead, an important research issue is to determine optimal times to initiate this action. In this paper, we first describe how to include faults attributed to software aging in the framework of Gray's software fault classification (deterministic and transient), and study the treatment and recovery strategies for each of the fault classes. We then construct a semi-Markov reward model based on workload and resource usage data collected from the UNIX operating system. We identify different workload states using statistical cluster analysis, and estimate transition probabilities and sojourn time distributions from the data. Corresponding to each resource, a reward function is then defined for the model based on the rate of resource depletion in each state. The model is then solved to obtain estimated times to exhaustion for each resource. The results from the semi-Markov reward model are then fed into a higher-level availability model that accounts for failure followed by reactive recovery, as well as proactive recovery. This comprehensive model is then used to derive optimal rejuvenation schedules that maximize availability or minimize downtime cost.",
  url = "http://ieeexplore.ieee.org/iel5/8858/31216/01453531.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
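; The rejuvenation paper above derives optimal schedules from a measured
; semi-Markov reward model. The Python sketch below shows only the core
; trade-off in a much simpler renewal-style approximation: rejuvenate too often
; and you pay the restart cost constantly; too rarely and the aging failure
; probability dominates. The Weibull aging distribution, the cost numbers, and
; the search grid are all assumed for illustration, not taken from the paper.

import math

SHAPE, SCALE = 2.5, 1000.0   # assumed Weibull aging: hazard grows with uptime (hours)
C_FAIL, C_REJUV = 4.0, 0.25  # assumed downtime: hours per crash vs. per rejuvenation

def failure_prob(t):
    """P(aging-induced failure before uptime t) under the assumed Weibull model."""
    return 1.0 - math.exp(-((t / SCALE) ** SHAPE))

def downtime_rate(T):
    """Approximate expected downtime per hour when rejuvenating every T hours."""
    return (failure_prob(T) * C_FAIL + C_REJUV) / T

best = min(range(50, 2001, 10), key=downtime_rate)
print(f"rejuvenate every ~{best} h; downtime rate {downtime_rate(best):.5f}")

; The minimum of this curve is the same tension the paper's availability model
; resolves, except there the transition rates come from measured workload data.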
; Supplemental Reading

@inproceedings{littlewood00_sw_dependability_roadmap,
  author = "Bev Littlewood and Lorenzo Strigini",
  title = "Software reliability and dependability: a roadmap",
  booktitle = "ICSE '00: Proceedings of the Conference on The Future of Software Engineering",
  year = "2000",
  isbn = "1-58113-253-0",
  pages = "175--188",
  location = "Limerick, Ireland",
  doi = "10.1145/336512.336551",
  publisher = "ACM Press",
  address = "New York, NY, USA",
  abstract = "Software's increasing role creates both requirements for being able to trust it more than before, and for more people to know how much they can trust their software. A sound engineering approach requires both techniques for producing reliability and sound assessment of the achieved results. Different parts of industry and society face different challenges: the need for education and cultural changes in some areas, the adaptation of known scientific results to practical use in others, and in others still the need to confront inherently hard problems of prediction and decision-making, both to clarify the limits of current understanding and to push them back. We outline the specific difficulties in applying a sound engineering approach to software reliability engineering, some of the current trends and problems, and a set of issues that we therefore see as important in an agenda for research in software dependability.",
  url = "http://portal.acm.org/citation.cfm?doid=336512.336551",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

@article{Sullivan95,
  author = "Sullivan, G.F. and Wilson, D.S. and Masson, G.M.",
  title = "Certification of computational results",
  journal = "IEEE Transactions on Computers",
  year = "1995",
  volume = "44",
  number = "7",
  pages = "833--847",
  abstract = "We describe a conceptually novel and powerful technique to achieve fault detection and fault tolerance in hardware and software systems. When used for software fault detection, this new technique uses time and software redundancy and can be outlined as follows. In the initial phase, a program is run to solve a problem and store the result. In addition, this program leaves behind a trail of data which we call a certification trail. In the second phase, another program is run which solves the original problem again. This program, however, has access to the certification trail left by the first program. Because of the availability of the certification trail, the second phase can be performed by a less complex program and can execute more quickly. In the final phase, the two results are compared and if they agree, the results are accepted as correct; otherwise an error is indicated. An essential aspect of this approach is that the second program must always generate either an error indication or a correct output even when the certification trail it receives from the first program is incorrect. We formalize the certification trail approach to fault tolerance and illustrate realizations of it by considering algorithms for the following problems: convex hull, sorting, and shortest path. We compare the certification trail approach to other approaches to fault tolerance.",
  url = "http://ieeexplore.ieee.org/iel1/12/8936/00392843.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
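; Sullivan et al.'s certification-trail idea, sketched below in Python for
; sorting (one of the paper's example problems). Using the sorting permutation
; as the trail is one natural choice, not necessarily the paper's exact
; construction; note the checker must reject corrupted trails as well as
; incorrect results, per the requirement quoted in the abstract.

def phase1_sort(xs):
    """First phase: solve the problem and leave a trail (the sorting permutation)."""
    trail = sorted(range(len(xs)), key=lambda i: xs[i])
    return [xs[i] for i in trail], trail

def phase2_check(xs, result, trail):
    """Second phase: linear-time re-solution using the trail. Returns False
    (an error indication) for a wrong result OR a corrupted trail."""
    if len(trail) != len(xs) or len(result) != len(xs):
        return False
    seen = [False] * len(xs)
    for i in trail:                      # trail must be a valid permutation
        if not (0 <= i < len(xs)) or seen[i]:
            return False
        seen[i] = True
    redo = [xs[i] for i in trail]        # reproduce the output through the trail
    return redo == result and all(
        redo[k] <= redo[k + 1] for k in range(len(redo) - 1))

data = [5, 3, 8, 1]
out, trail = phase1_sort(data)
assert phase2_check(data, out, trail)               # results agree: accept
assert not phase2_check(data, out, [0, 1, 2, 2])    # corrupted trail: error indicated

; Given the trail, the second phase runs in linear time rather than sorting
; again, which is the source of the speedup the abstract describes.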
@Conference{garg98_sw_aging,
  author = "S. Garg and A. van Moorsel and K. Vaidyanathan and K. S. Trivedi",
  title = "A Methodology for Detection and Estimation of Software Aging",
  booktitle = "International Symposium on Software Reliability Engineering",
  year = "1998",
  abstract = "The phenomenon of software aging refers to the accumulation of errors during the execution of the software, which eventually results in its crash/hang failure. A gradual performance degradation may also accompany software aging. Proactive fault management techniques such as software rejuvenation may be used to counteract aging if it exists. In this paper, we propose a methodology for detection and estimation of aging in the UNIX operating system...",
  url = "http://ieeexplore.ieee.org/iel4/5926/15784/00730892.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}

@Conference{Arlat88,
  author = "Arlat, J. and Kanoun, K. and Laprie, J.-C.",
  title = "Dependability evaluation of software fault-tolerance",
  booktitle = "Eighteenth International Symposium on Fault-Tolerant Computing (FTCS-18), Digest of Papers",
  year = "1988",
  pages = "142--177",
  abstract = "The authors present a detailed reliability and safety analysis of the two major software fault-tolerance approaches, recovery blocks (RB) and N-version programming (NVP). The methodology used for the modeling is based on the identification of the possible types of faults introduced during the specification and the implementation, and on the analysis of the behavior following fault activation. The main outcome of the evaluation concerns the derivation of analytical results for identifying the improvement that can result from the use of RB and NVP and for revealing the most critical types of related faults. The study of nested RBs shows that the proposed analysis approach can be applied to such realistic software structures; when an alternate is itself an RB, the results are analogous to the case of the addition of a third alternate. The reliability analysis showed that an improvement can be expected, but that this improvement would be very low. The study of the discarding of a failed version in NVP shows that this strategy is always worthwhile for safety, whereas, for reliability, it is only worthwhile when independent faults dominate.",
  url = "http://ieeexplore.ieee.org//iel3/3846/11214/00532634.pdf",
  studentname = "",
  summary = "",
  contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "",
  weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "",
  interesting = "high/med/low",
  opinions = "",
}
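; Arlat et al. model recovery blocks (RB) and N-version programming. For
; reference, the RB control structure they analyze looks like this in outline
; (Python sketch; the alternates and the acceptance test are toy stand-ins):

def recovery_block(state, alternates, acceptance_test):
    """Run alternates in order from a checkpoint until one passes the test."""
    checkpoint = dict(state)                   # establish the recovery point
    for alt in alternates:
        try:
            candidate = alt(dict(checkpoint))  # each retry restarts from the checkpoint
            if acceptance_test(candidate):
                return candidate               # acceptance test passed
        except Exception:
            pass                               # an exception counts as a failed alternate
    raise RuntimeError("all alternates rejected: RB signals failure")

# Toy job: compute y = x*x. The primary alternate has a seeded bug at x == 3.
primary = lambda s: {**s, "y": s["x"] * s["x"] + (1 if s["x"] == 3 else 0)}
backup  = lambda s: {**s, "y": s["x"] ** 2}
accept  = lambda s: s.get("y") == s["x"] * s["x"]

print(recovery_block({"x": 3}, [primary, backup], accept))  # {'x': 3, 'y': 9}

; Here the acceptance test is a perfect oracle, so the demo always recovers;
; the paper's analysis hinges on exactly the realistic case where it is not,
; and on related faults shared between alternates.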
", title = "Progressive retry for software error recovery in distributed systems", inbook = "Digest of Papers FTCS-23 The Twenty-Third International Symposium on Fault-Tolerant Computing", year = "1993", pages = "138-44", abstract = "A method of execution retry for bypassing software faults based on checkpointing, rollback, message reordering, and replaying is described. The authors demonstrate how rollback techniques, previously developed for transient hardware failure recovery, can also be used to recover from software errors by exploiting message reordering to bypass software faults. The approach intentionally increases the degree of nondeterminism and the scope of rollback when a previous retry fails. Examples from experience with telecommunications software systems illustrate the benefits of the scheme", url = "http://ieeexplore.ieee.org/iel3/4964/13650/00627317.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Taylor82, author = "Taylor, D.J. ; Black, J.P.", title = "Principles of data structure error correction", journal = "IEEE Transactions on Computers C-31,", year = "1982", pages = "602-8", number = "7", abstract = "Error correction in robust data structures is a difficult problem. Several algorithms for correcting structural errors, in certain list and tree structures, are now known. These algorithms have been examined to determine common design features which may prove useful in the design of correction algorithms for other structures. This paper presents a summary of the algorithms studied and the design principles which were derived. The paper is not a `cookbook' for constructing error correction algorithms but should prove useful to those designing such algorithms. Implications for the design of robust data structures, so that correction may be done easily, are also briefly discussed", url = "http://www.ece.cmu.edu/~ece749/papers/taylor82_data_structure_ec.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", }