; Hand this in to: ece849-staff+hw@ece.cmu.edu ; Required Readings @article{poledna95_sensor_agreement, author = "Poledna, S.", title = "Fault tolerance in safety critical automotive applications: cost of agreement as a limiting factor ", journal = "FTCS-25", year = "1995", pages = "73-82", abstract = "The high availaility and safety requirements for automotive electronics are currently almost exclusively addressed by application-specific engineering solutions to fault tolerance rather than by systematic approaches. Currently, systematic approaches are ruled out because of cost. The reason for this is that a systematic appraoch to fault tolerance requires(1) replicated components and (2) communications between replicated components to acheive agreement despite of nondeterminism. While replicated components become more and more available with the connection of different control units by means of a multiplex bus, it is show that the cost of agreement on sensor inputs will become the limiting factor ...", url = "http://ieeexplore.ieee.org/iel2/3246/9797/00466996.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @InProceedings{ latronico05_dsn_rel_analysis_distrft, author = {Latronico, E. & Koopman, P.}, title = {Design Time Reliability Analysis of Distributed Fault Tolerance Algorithms}, booktitle = {International Conference on Dependable Systems and Networks}, pages = {486 - 495}, year = {2005}, address = {Yokohama, Japan}, month = {Jun.}, abstract = "Designing a distributed fault tolerance algorithm requires careful analysis of both fault models and diagnosis strategies. A system will fail if there are too many active faults, especially active Byzantine faults. But, a system will also fail if overly aggressive convictions leave inadequate redundancy. For high reliability, an algorithm’s hybrid fault model and diagnosis strategy must be tuned to the types and rates of faults expected in the real world. We examine this balancing problem for two common types of distributed algorithms: clock synchronization and group membership. We show the importance of choosing a hybrid fault model appropriate for the physical faults expected by considering two clock synchronization algorithms. Three group membership service diagnosis strategies are used to demonstrate the benefit of discriminating between permanent and transient faults. In most cases, the probability of failure is dominated by one fault type. By identifying the dominant cause of failure, one can tailor an algorithm appropriately at design time, yielding significant reliability gain.", url = "http://ieeexplore.ieee.org/iel5/9904/31476/01467823.pdf?tp=&arnumber=1467823&isnumber=31476", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @Conference{cristian88_group_membership, author = "Cristian, F. ", title = "Agreeing on who is present and who is absent in a synchronous distributed system", booktitle = "Eighteenth International Symposium on Fault-Tolerant Computing. Digest of Papers. FTCS-18 ", year = "1988", pages = "206-11", abstract = "The author describes his system model and failure assumptions by precisely specifying the processor group membership problem. He then gives two protocols for solving this problem. The protocols provide all correct processors with constituent views of the processor group membership. They also guarantee bounded processor failure detection and join processing delays despite any number of performance failures that do not cause network partitioning. The first protocol provides very fast processor failure detection but can require a significant message traffic overhead, even when no failures occur. To reduce this overhead, the author derives the second protocol, which has a (provable) minimal message overhead in the absence of failures but provides a longer failure detection delay and is more complex. He concludes by comparing his approach with other known approaches", url = "http://ieeexplore.ieee.org/iel2/210/275/00005321.pdf", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } ; Supplemental Readings @article{ cristian91_agreement, author = "F. Cristian", title = "Reaching Agreement on Processor-Group Membership in Synchronous Distributed Systems", journal = "Distributed Computing", volume = "4", number = "4", publisher = "Springer-Verlag", address = "Berlin, Heidelberg, New York, Tokyo", pages = "175--188", year = "1991", abstract = "Reaching agreement on the identity of correctly functioning processors of a distributed system in the presence of random communication delays, failures and processor joins is a fundamental problem in fault-tolerant distributed systems. Assuming a synchronous communication network that is not subject to partition occurrences, we specify the processor-group membership problem and we propose three simple protocols for solving it. The protocols provide all correct processors with consistent...", url = "http://citeseer.nj.nec.com/cristian91reaching.html", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Pease80, author = "Pease, M. ; Shostak, R. ; Lamport, L.", title = "Reaching agreement in the presence of faults", journal = "Journal of the Association for Computing Machinery 27,", year = "1980", pages = "228-34", number = "2", abstract = "The problem addressed here concerns a set of isolated processors, some unknown subset of which may be faulty, that communicate only by means of two-party messages. Each nonfaulty processor has a private value of information that must be communicated to each other nonfaulty processor. Nonfaulty processors always communicate honestly, whereas faulty processors may lie. The problem is to devise an algorithm in which processors communicate their own values and relay values received from others that allows each nonfaulty processor to infer a value for each other processor. The value inferred for a nonfaulty processor must be that processor's private value, and the value inferred for a faulty one must be consistent with the corresponding value inferred by each other nonfaulty processor. It is shown that the problem is solvable for, and only for, n>or=3m+1, where m is the number of faulty processors and n is the total number. It is also shown that if faulty processors can refuse to pass on information but cannot falsely relay information, the problem is solvable for arbitrary n>or=m>or=0. This weaker assumption can be approximated in practice using cryptographic methods", url = "http://doi.acm.org/10.1145/322186.322188", studentname = "", summary = "", contribution1 ="", contribution2 ="", contribution3 ="", contribution4 ="", contribution5 ="", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @Conference{Lampson96, author = "Lampson, B.W. ", title = "How to build a highly available system using consensus", inbook = "Babaoglu, O. Marzullo, K. ", year = "1996", pages = "1-17", abstract = "Lamport showed that a replicated deterministic state machine is a general way to implement a highly available system, given a consensus algorithm that the replicas can use to agree on each input. His Paxos algorithm is the most fault-tolerant way to get consensus without real-time guarantees. Because general consensus is expensive, practical systems reserve it for emergencies and use leases (locks that time out) for most of the computing. This paper explains the general scheme for efficient highly available computing, gives a general method for understanding concurrent and fault-tolerant programs, and derives the Paxos algorithm as an example of the method", url = "http://citeseer.nj.nec.com/lampson96how.html", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Lamport98, author = "Lamport, T.", title = "The part-time parliament", journal = "ACM Transactions on Computer Systems 16,", year = "1998", pages = "133-69", number = "2", abstract = "Recent archaeological discoveries on the island of Paxos reveal that the parliament functioned despite the peripatetic propensity of its part time legislators. The legislators maintained consistent copies of the parliamentary record, despite their frequent forays from the chamber and the forgetfulness of their messengers. The Paxon parliament's protocol provides a new way of implementing the state machine approach to the design of distributed systems", url = "http://citeseer.nj.nec.com/lamport00parttime.html", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Birman93, author = "Birman, K..P.", title = "The process group approach to reliable distributed computing", journal = "Communications of the ACM 36,", year = "1993", pages = "36-53", number = "12", abstract = "One might expect the reliability of a distributed system to correspond directly to the reliability of its constituents, but this is not always the case. The mechanisms used to structure a distributed system and to implement cooperation between components play a vital role in determining the reliability of the system. Many contemporary distributed operating systems have placed emphasis on communication performance, overlooking the need for tools to integrate components into a reliable whole. The communication primitives supported give generally reliable behaviour, but exhibit problematic semantics when transient failures or system configuration changes occur. The resulting building blocks are, therefore, unsuitable for facilitating the construction of systems where reliability is important. This article reviews 10 years of research on ISIS, a system that provides tools to support the construction of reliable distributed software. The thesis underlying ISIS is that development of reliable distributed software can be simplified using process groups and group programming tools. This article describes the approach taken, surveys the system, and discusses experiences with real applications", url = "http://doi.acm.org/10.1145/163298.163303", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Poledna95, author = "Poledna, S.", title = "Tolerating sensor timing faults in highly responsive hard real-time systems", journal = "IEEE Transactions on Computers 44,", year = "1995", pages = "181-91", number = "2", abstract = "Real-time systems that have to respond to environmental state changes within a very short latency period often use event-triggered task activation. If the system has to function correctly in the presence of sensor faults, event-triggered task activation is not reliable. Faulty sensors may cause task activations to occur too early, too late, or task activations are omitted entirely. In particular, early task activations can overload the system. Time-triggered task activation is reliable, but by defining a competitiveness ratio it is shown that the processor utilization for highly responsive tasks is unacceptably low. To overcome the problems of event-triggered task activation while preserving its good performance the task-splitting model is introduced. The task-splitting model integrates fault tolerance into the analysis and construction of hard real-time systems by using a combination of event-triggered and time-triggered task activation. Based on a general task model, it is independent of any particular scheduling algorithm. The result of this work has influenced the design of a new operating system which will be applied in a robust automotive engine controller of the next generation", url = "http://ieeexplore.ieee.org/iel1/12/8353/00364530.pdf", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Parker83, author = "Parker, D.S., Jr. ; Popek, G.J. ; Rudisin, G. ; Stoughton, A. ; Walker, B.J. ; Walton, E. ; Chow, J.M. ; Edwards, D. ; Kiser, S. ; Kline, C.", title = "Detection of mutual inconsistency in distributed systems", journal = "IEEE Transactions on Software Engineering SE-9,", year = "1983", pages = "240-7", number = "3", abstract = "Many distributed systems are now being developed to provide users with convenient access to data via some kind of communications network. In many cases it is desirable to keep the system functioning even when it is partitioned by network failures. A serious problem in this context is how one can support redundant copies of resources such as files (for the sake of reliability) while simultaneously monitoring their mutual consistency (the equality of multiple copies). This is difficult since network failures can lead to inconsistency, and disrupt attempts at maintaining consistency. In fact, even the detection of inconsistent copies is a nontrivial problem. Naive methods either (1) compare the multiple copies entirely or (2) perform simple tests which will diagnose some consistent copies as inconsistent. Here a new approach, involving version vectors and origin points, is presented and shown to detect single file, multiple copy mutual inconsistency effectively. The approach has been used in the design of LOCUS, a local network operating system at UCLA", url = "http://www.ece.cmu.edu/~ece749/papers/parker83_mutual_inconsistency.pdf", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", } @article{Davidson86, author = "Davidson, S.B. ; Garcia-Molina, H. ; Skeen, D.", title = "Consistency in partitioned networks", journal = "Computing Surveys 17,", year = "1986", pages = "341-70", number = "3", abstract = "Recently, several strategies have been proposed for transaction processing in partitioned distributed database systems with replicated data. These strategies are surveyed in light of the competing goals of maintaining correctness and achieving high availability. Extensions and combinations are then discussed, and guidelines are presented for selecting strategies for particular applications", url = "http://doi.acm.org/10.1145/5505.5508", studentname = "", summary = "", contribution1 = "", contribution2 = "", contribution3 = "", contribution4 = "", contribution5 = "", weakness1 = "", weakness2 = "", weakness3 = "", weakness4 = "", weakness5 = "", interesting = "high/med/low", opinions = "", }