; Reading-list template: one entry per assigned paper. The standard fields
; (author, title, venue, year, pages, url, abstract) describe the paper; the
; remaining fields (studentname, summary, contribution1-5, weakness1-5,
; interesting, opinions) are review slots left empty for the student to fill in.
;
; NOTE: never put a literal at-sign in free text outside an entry. BibTeX
; treats it as the start of an entry, so the address below is spelled "(at)".
;
; Hand this in to: ece849-staff+hw (at) ece.cmu.edu

; Required Readings

@article{maxion00_dependability_cases,
  author        = {Maxion, R. A. and Olszewski, R. T.},
  title         = {Eliminating exception handling errors with dependability cases: a comparative, empirical study},
  journal       = {IEEE Transactions on Software Engineering},
  organization  = {IEEE},
  year          = {2000},
  volume        = {26},
  number        = {9},
  pages         = {888--906},
  abstract      = {Programs fail mainly for two reasons: logic errors in the code and exception failures. Exception failures can account for up to two thirds of system crashes, hence, are worthy of serious attention... },
  url           = {http://ieeexplore.ieee.org/iel5/32/19000/00877848.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{turek92_distributed_consensus,
  author        = {Turek, John and Shasha, Dennis},
  title         = {The Many Faces of Consensus in Distributed Systems},
  journal       = {IEEE Computer},
  volume        = {25},
  number        = {6},
  year          = {1992},
  pages         = {8--17},
  url           = {http://dx.doi.org/10.1109/2.153253},
  abstract      = {Known results regarding consensus among processors are surveyed and related to practice. The ideas embodied in the various proofs are explained. The goal is to give practitioners some sense of the system hardware and software guarantees that are required to achieve a given level of reliability and performance. The survey focuses on two categories of failures: fail-stop failures, which occur when processors fail by stopping; and Byzantine failures, which occur when processors fail by acting maliciously.},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{raynal05_failure_detectors,
  author        = {Raynal, Michel},
  title         = {A short introduction to failure detectors for asynchronous distributed systems},
  journal       = {SIGACT News},
  volume        = {36},
  number        = {1},
  year          = {2005},
  pages         = {53--70},
  url           = {http://doi.acm.org/10.1145/1052796.1052806},
  abstract      = {Since the first version of Chandra and Toueg's seminal paper titled ``Unreliable failure detectors for reliable distributed systems'' in 1991, the failure detector concept has been extensively studied and investigated. This is not at all surprising as failure detection is pervasive in the design, the analysis and the implementation of a lot of fault-tolerant distributed algorithms that constitute the core of distributed system middleware. The literature on this topic is mostly technical and appears mainly in theoretically inclined journals and conferences. The aim of this paper is to offer an introductory survey to the failure detector concept for readers who are not familiar with it and want to quickly understand its aim, its basic principles, its power and limitations. To attain this goal, the paper first describes the motivations that underlie the concept, and then surveys several distributed computing problems showing how they can be solved with the help of an appropriate failure detector. So, this short paper presents motivations, concepts, problems, definitions, and algorithms. It does not contain proofs. It is aimed at people who want to understand basics of failure detectors.},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; The citation key below is kept exactly as distributed (including the
; ".pdf" suffix) so that existing \cite commands continue to resolve.
@article{fischer85_impossibility_faulty_processor.pdf,
  author        = {Fischer, Michael J. and Lynch, Nancy A. and Paterson, Michael S.},
  title         = {Impossibility of distributed consensus with one faulty process},
  journal       = {Journal of the ACM},
  volume        = {32},
  number        = {2},
  year          = {1985},
  pages         = {374--382},
  url           = {http://doi.acm.org/10.1145/3149.214121},
  abstract      = {The consensus problem involves an asynchronous system of processes, some of which may be unreliable. The problem is for the reliable processes to agree on a binary value. In this paper, it is shown that every protocol for this problem has the possibility of nontermination, even with only one faulty process. By way of contrast, solutions are known for the synchronous case, the Byzantine Generals problem.},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; Supplemental Reading

; Key was "De Vale02"; a citation key cannot contain a space, so it is now
; "DeVale02".
@inproceedings{DeVale02,
  author        = {De Vale, J. and Koopman, P.},
  title         = {Robust software - no more excuses},
  booktitle     = {Proceedings International Conference on Dependable Systems and Networks},
  year          = {2002},
  pages         = {145--154},
  abstract      = {Software developers identify two main reasons why software systems are not made robust: performance and practicality. We demonstrate the effectiveness of general techniques to improve robustness that are practical and yield high performance. We present data from treating three systems to improve robustness by a factor of 5 or more, with a measured performance penalty of under 5\% in nearly every case, and usually under 2\%. We identify a third possible reason why software systems are not made robust: developer awareness. A case study on three professional development groups evaluated their ability to estimate the robustness of their software. Two groups were able to estimate their software's robustness to some extent, while one group had more divergent results. Although we can overcome the technical challenges, it appears that even experienced developers can benefit from tools to locate robustness failures and training in robustness issues},
  url           = {http://ieeexplore.ieee.org/iel5/7991/22107/01028895.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{Maxion98,
  author        = {Maxion, R. A. and Olszewski, R. T.},
  title         = {Improving software robustness with dependability cases},
  booktitle     = {Digest of Papers. Twenty-Eighth Annual International Symposium on Fault-Tolerant Computing},
  year          = {1998},
  pages         = {346--355},
  abstract      = {Programs fail mainly for two reasons: logic errors in the code, and exception failures. Exception failures can account for up to 2/3 of system crashes, hence are worthy of serious attention. Traditional approaches to reducing exception failures, such as code reviews, walkthroughs and formal testing, while very useful, are limited in their ability to address a core problem: the programmer's inadequate coverage of exceptional conditions. The problem of coverage might be rooted in cognitive factors that impede the mental generation (or recollection) of exception cases that would pertain in a particular situation, resulting in insufficient software robustness. This paper describes a study to test the hypothesis that robustness for exception failures can be improved through the use of dependability cases. Dependability cases, derived from safety cases, comprise a methodology based on structured taxonomies and memory aids for helping software designers think about and improve exception-handling coverage. A controlled experiment conducted with 59 subjects revealed a statistically significant 43\% increase in exception-handling robustness. An ancillary experiment conducted with 38 subjects provides convergent evidence that the effect is authentic, and not due to programming expertise alone},
  url           = {http://ieeexplore.ieee.org/iel4/5640/15114/00689485.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}