; Reading list and review template.
; Each entry carries the bibliographic data for one paper plus blank
; student-response fields (studentname, summary, contribution1-5,
; weakness1-5, interesting, opinions) to be filled in and handed in.
; The response fields are not standard BibTeX fields; bibliography styles
; ignore them. They are deliberately kept in "quoted" form.
;
; Hand this in to: ece849-staff+hw (at) ece.cmu.edu
; (The address is spelled with "(at)" on purpose: BibTeX treats an at-sign
; outside an entry as the start of a new entry, so a literal address here
; causes a parse error and can swallow the entry that follows.)

; Required Readings

@inproceedings{dawson96_orchestra,
  author        = {Dawson, S. and Jahanian, F. and Mitton, T. and Tung, Teck-Lee},
  title         = {Testing of Fault-Tolerant and Real-Time Distributed Systems via Protocol Fault Injection},
  booktitle     = {FTCS},
  year          = {1996},
  abstract      = {As software for distributed systems becomes more complex, ensuring that a system meets its prescribed specification is a growing challenge that confronts software developers. This is particularly important for distributed applications with strict dependability and timeliness constraints. This paper reports on ORCHESTRA a portable fault injection environment for testing implementations of distributed protocols....},
  url           = {http://ieeexplore.ieee.org/iel3/3791/11109/00534626.pdf},
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{devale02_no_more_excuses,
  author        = {Devale, J and Koopman, P},
  title         = {Robust Software - No More Excuses},
  booktitle     = {International Conference on Dependable Systems and Networks (2002)},
  year          = {2002},
  abstract      = {Software developers identify two major reasons why software systems are not made robust: performance and practicality. This work demonstrates the effectiveness of general techniques to improve robustness that are practical and yield high performance. We present data from treating three systems to improve robustness by a factor of 5 or more, with a measured performance penalty of under 5\% in nearly every case, and usually under 2\%...},
  url           = {http://www.ece.cmu.edu/~ece749/papers/devale02_no_more_excuses.pdf},
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{candea04_microreboot,
  author        = {George Candea and Shinichi Kawamoto and Yuichi Fujiki and Greg Friedman and Armando Fox},
  title         = {Microreboot -- A Technique for Cheap Recovery},
  booktitle     = {Symposium on Operating Systems Design and Implementation},
  year          = {2004},
  address       = {San Francisco, CA},
  month         = dec,
  url           = {http://www.stanford.edu/~candea/papers/microreboot/microreboot.pdf},
  abstract      = {A significant fraction of software failures in large-scale Internet systems are cured by rebooting, even when the exact failure causes are unknown. However, rebooting can be expensive, causing nontrivial service disruption or downtime even when clusters and failover are employed. In this work we separate process recovery from data recovery to enable microrebooting -- a fine-grain technique for surgically recovering faulty application components, without disturbing the rest of the application. We evaluate microrebooting in an Internet auction system running on an application server. Microreboots recover most of the same failures as full reboots, but do so an order of magnitude faster and result in an order of magnitude savings in lost work. This cheap form of recovery engenders a new approach to high availability: microreboots can be employed at the slightest hint of failure, prior to node failover in multi-node clusters, even when mistakes in failure detection are likely; failure and recovery can be masked from end users through transparent call-level retries; and systems can be rejuvenated by parts, without ever being shut down.},
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "med",
  opinions      = "",
}

; Supplemental Readings

; NOTE(review): no journal or proceedings name is recorded for the next
; entry, so it is typed as misc (an article entry requires a journal field
; and would warn without one). Confirm the venue and promote the entry
; type once it is known.
@misc{dingman95_robustness,
  author        = {Dingman, C. P. and Marshall, J. and Siewiorek, D.},
  title         = {Measuring Robustness of a Fault Tolerant Aerospace System},
  year          = {1995},
  abstract      = {In commercial literature, the meaning of the term fault tolerant has become vague. In this paper we describe a system used to measure the robustness of a fault tolerant aerospace system developed at IBM, present the data collected during the project, and report conclusions and areas for future work.},
  url           = {http://www.ece.cmu.edu/~ece749/papers/dingman95_robustness.pdf},
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{Madeira00,
  author        = {Madeira, H. and Costa, D. and Vieira, M.},
  title         = {On the emulation of software faults by software fault injection},
  booktitle     = {Proceeding International Conference on Dependable Systems and Networks. DSN 2000},
  year          = {2000},
  pages         = {417--426},
  abstract      = {This paper presents an experimental study on the emulation of software faults by fault injection. In a first experiment, a set of real software faults has been compared with faults injected by a SWIFI tool (Xception) to evaluate the accuracy of the injected faults. Results revealed the limitations of Xception (and other SWIFI tools) in the emulation of different classes of software faults (about 44\% of the software faults cannot be emulated). The use of field data about real faults was discussed and software metrics were suggested as an alternative to guide the injection process when field data is not available. In a second experiment, a set of rules for the injection of errors meant to emulate classes of software faults was evaluated. The fault triggers used seem to be the cause for the observed strong impact of the faults in the target system and in the program results. The results also show the influence in the fault emulation of aspects such as code size, complexity of data structures, and recursive versus sequential execution},
  url           = {http://ieeexplore.ieee.org/iel5/6928/18625/00857571.pdf},
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@article{Mukherjee97,
  author        = {Mukherjee, A. and Siewiorek, D. P.},
  title         = {Measuring software dependability by robustness benchmarking},
  journal       = {IEEE Transactions on Software Engineering},
  volume        = {23},
  number        = {6},
  pages         = {366--378},
  year          = {1997},
  abstract      = {Inability to identify weaknesses or to quantify advancements in software system robustness frequently hinders the development of robust software systems. Efforts have been made to develop benchmarks of software robustness to address this problem, but they all suffer from significant shortcomings. The paper presents the various features that are desirable in a benchmark of system robustness, and evaluates some existing benchmarks according to these features. A new hierarchically structured approach to building robustness benchmarks, which overcomes many deficiencies of past efforts, is also presented. This approach has been applied to building a hierarchically structured benchmark that tests part of the Unix file and virtual memory systems. The resultant benchmark has successfully been used to identify new response class structures that were not detected in a similar situation by other less organized techniques},
  url           = {http://ieeexplore.ieee.org/iel1/32/12999/00601075.pdf},
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{Vo97,
  author        = {Vo, K.-P. and Wang, Y.-M. and Chung, P. E. and Huang, Y.},
  title         = {{Xept}: a software instrumentation method for exception handling},
  booktitle     = {Proceedings. The Eighth International Symposium on Software Reliability Engineering},
  year          = {1997},
  pages         = {60--69},
  abstract      = {Modern software systems are often built from existing library components. A common problem is how to fix bugs when source code is not available. Xept is an instrumentation language and tool that can be used to add to object code the ability to detect, mask, recover and propagate exceptions from library functions. This helps to alleviate or avoid a large class of errors resulting from function misuses. Examples are given to show applications of Xept in actual software systems},
  url           = {http://ieeexplore.ieee.org/iel3/4993/13710/00630848.pdf},
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{Siewiorek93,
  author        = {Siewiorek, D. P. and Hudak, J. J. and Suh, B.-H. and Segal, Z.},
  title         = {Development of a benchmark to measure system robustness},
  booktitle     = {Digest of Papers FTCS-23 The Twenty-Third International Symposium on Fault-Tolerant Computing},
  year          = {1993},
  pages         = {88--97},
  abstract      = {An initial attempt at the development of a set of benchmarks to gauge a system's robustness as measured by its ability to tolerate errors is presented. Due to the large domain of system components whose intolerance to errors can lead to system failure, several primitive benchmarks that can be combined into a robustness benchmark suite are presented. Each primitive benchmark targets a system functionality and measure its behavior given erroneous inputs. Four primitive benchmarks have been implemented in this initial effort. They target the file management system, memory access, user application, and the C library functions. The motivation and experimental results of each of these primitive benchmarks are presented in detail followed by an analysis of the results. A methodology to combine the primitive benchmarks to form an overall robustness figure is presented. A list of additional primitive benchmarks is suggested},
  url           = {http://ieeexplore.ieee.org/iel3/4964/13650/00627311.pdf},
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}