; Hand this in to:   ece849-staff+hw@ece.cmu.edu

; Required Readings


@article{poledna95_sensor_agreement,
  author =	 "Poledna, S.",
  title =	 "Fault tolerance in safety critical automotive
                  applications: cost of agreement as a limiting factor
                  ",
  journal =	 "FTCS-25",
  year =	 "1995",
  pages =	 "73-82",
  abstract =	 "The high availaility and safety requirements for
                  automotive electronics are currently almost
                  exclusively addressed by application-specific
                  engineering solutions to fault tolerance rather than
                  by systematic approaches. Currently, systematic
                  approaches are ruled out because of cost. The reason
                  for this is that a systematic appraoch to fault
                  tolerance requires(1) replicated components and (2)
                  communications between replicated components to
                  acheive agreement despite of nondeterminism. While
                  replicated components become more and more available
                  with the connection of different control units by
                  means of a multiplex bus, it is show that the cost
                  of agreement on sensor inputs will become the
                  limiting factor ...",
  url =
                  "http://ieeexplore.ieee.org/iel2/3246/9797/00466996.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}

@InProceedings{ latronico05_dsn_rel_analysis_distrft,
  author =	 {Latronico, E. & Koopman, P.},
  title =	 {Design Time Reliability Analysis of Distributed
                  Fault Tolerance Algorithms},
  booktitle =	 {International Conference on Dependable Systems and
                  Networks},
  pages =	 {486 - 495},
  year =	 {2005},
  address =	 {Yokohama, Japan},
  month =	 {Jun.},
  abstract =	 "Designing a distributed fault tolerance algorithm
                  requires careful analysis of both fault models and
                  diagnosis strategies. A system will fail if there
                  are too many active faults, especially active
                  Byzantine faults. But, a system will also fail if
                  overly aggressive convictions leave inadequate
                  redundancy. For high reliability, an algorithm’s
                  hybrid fault model and diagnosis strategy must be
                  tuned to the types and rates of faults expected in
                  the real world. We examine this balancing problem
                  for two common types of distributed algorithms:
                  clock synchronization and group membership. We show
                  the importance of choosing a hybrid fault model
                  appropriate for the physical faults expected by
                  considering two clock synchronization
                  algorithms. Three group membership service diagnosis
                  strategies are used to demonstrate the benefit of
                  discriminating between permanent and transient
                  faults. In most cases, the probability of failure is
                  dominated by one fault type. By identifying the
                  dominant cause of failure, one can tailor an
                  algorithm appropriately at design time, yielding
                  significant reliability gain.",
  url =
                  "http://ieeexplore.ieee.org/iel5/9904/31476/01467823.pdf?tp=&arnumber=1467823&isnumber=31476",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}



@Conference{cristian88_group_membership,
  author =	 "Cristian, F. ",
  title =	 "Agreeing on who is present and who is absent in a
                  synchronous distributed system",
  booktitle =	 "Eighteenth International Symposium on Fault-Tolerant
                  Computing. Digest of Papers. FTCS-18 ",
  year =	 "1988",
  pages =	 "206-11",
  abstract =	 "The author describes his system model and failure
                  assumptions by precisely specifying the processor
                  group membership problem. He then gives two
                  protocols for solving this problem. The protocols
                  provide all correct processors with constituent
                  views of the processor group membership. They also
                  guarantee bounded processor failure detection and
                  join processing delays despite any number of
                  performance failures that do not cause network
                  partitioning. The first protocol provides very fast
                  processor failure detection but can require a
                  significant message traffic overhead, even when no
                  failures occur. To reduce this overhead, the author
                  derives the second protocol, which has a (provable)
                  minimal message overhead in the absence of failures
                  but provides a longer failure detection delay and is
                  more complex. He concludes by comparing his approach
                  with other known approaches",
  url =
                  "http://ieeexplore.ieee.org/iel2/210/275/00005321.pdf",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}












; Supplemental Readings


@article{ cristian91_agreement,
  author =	 "F. Cristian",
  title =	 "Reaching Agreement on Processor-Group Membership in
                  Synchronous Distributed Systems",
  journal =	 "Distributed Computing",
  volume =	 "4",
  number =	 "4",
  publisher =	 "Springer-Verlag",
  address =	 "Berlin, Heidelberg, New York, Tokyo",
  pages =	 "175--188",
  year =	 "1991",
  abstract =	 "Reaching agreement on the identity of correctly
                  functioning processors of a distributed system in
                  the presence of random communication delays,
                  failures and processor joins is a fundamental
                  problem in fault-tolerant distributed
                  systems. Assuming a synchronous communication
                  network that is not subject to partition
                  occurrences, we specify the processor-group
                  membership problem and we propose three simple
                  protocols for solving it. The protocols provide all
                  correct processors with consistent...",
  url =		 "http://citeseer.nj.nec.com/cristian91reaching.html",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}
@article{Pease80,
  author =	 "Pease, M. ; Shostak, R. ; Lamport, L.",
  title =	 "Reaching agreement in the presence of faults",
  journal =	 "Journal of the Association for Computing Machinery
                  27,",
  year =	 "1980",
  pages =	 "228-34",
  number =	 "2",
  abstract =	 "The problem addressed here concerns a set of
                  isolated processors, some unknown subset of which
                  may be faulty, that communicate only by means of
                  two-party messages. Each nonfaulty processor has a
                  private value of information that must be
                  communicated to each other nonfaulty
                  processor. Nonfaulty processors always communicate
                  honestly, whereas faulty processors may lie. The
                  problem is to devise an algorithm in which
                  processors communicate their own values and relay
                  values received from others that allows each
                  nonfaulty processor to infer a value for each other
                  processor. The value inferred for a nonfaulty
                  processor must be that processor's private value,
                  and the value inferred for a faulty one must be
                  consistent with the corresponding value inferred by
                  each other nonfaulty processor. It is shown that the
                  problem is solvable for, and only for, n>or=3m+1,
                  where m is the number of faulty processors and n is
                  the total number. It is also shown that if faulty
                  processors can refuse to pass on information but
                  cannot falsely relay information, the problem is
                  solvable for arbitrary n>or=m>or=0. This weaker
                  assumption can be approximated in practice using
                  cryptographic methods",
  url =		 "http://doi.acm.org/10.1145/322186.322188",
  studentname =	 "",
  summary =	 "",
  contribution1 ="",
  contribution2 ="",
  contribution3 ="",
  contribution4 ="",
  contribution5 ="",
  weakness1 =	 "",
  weakness2 =	 "",
  weakness3 =	 "",
  weakness4 =	 "",
  weakness5 =	 "",
  interesting =	 "high/med/low",
  opinions =	 "",
}


@Conference{Lampson96,
   author = "Lampson, B.W. ",
   title  = "How to build a highly available system using consensus",
   inbook = "Babaoglu, O. Marzullo, K. ",
   year = "1996",
   pages = "1-17",
   abstract = "Lamport showed that a replicated deterministic state machine is a general way to implement a highly available system, given a consensus algorithm that the replicas can use to agree on each input. His Paxos algorithm is the most fault-tolerant way to get consensus without real-time guarantees. Because general consensus is expensive, practical systems reserve it for emergencies and use leases (locks that time out) for most of the computing. This paper explains the general scheme for efficient highly available computing, gives a general method for understanding concurrent and fault-tolerant programs, and derives the Paxos algorithm as an example of the method",
   url    = "http://citeseer.nj.nec.com/lampson96how.html",
   studentname =   "",
   summary =   "",
   contribution1 =   "",
   contribution2 =   "",
   contribution3 =   "",
   contribution4 =   "",
   contribution5 =   "",
   weakness1 =   "",
   weakness2 =   "",
   weakness3 =   "",
   weakness4 =   "",
   weakness5 =   "",
   interesting =   "high/med/low",
   opinions =   "",
}

@article{Lamport98,
   author = "Lamport, T.",
   title  = "The part-time parliament",
   journal = "ACM Transactions on Computer Systems 16,",
   year = "1998",
   pages = "133-69",
   number = "2",
   abstract = "Recent archaeological discoveries on the island of Paxos reveal that the parliament functioned despite the peripatetic propensity of its part time legislators. The legislators maintained consistent copies of the parliamentary record, despite their frequent forays from the chamber and the forgetfulness of their messengers. The Paxon parliament's protocol provides a new way of implementing the state machine approach to the design of distributed systems",
   url    = "http://citeseer.nj.nec.com/lamport00parttime.html",
   studentname =   "",
   summary =   "",
   contribution1 =   "",
   contribution2 =   "",
   contribution3 =   "",
   contribution4 =   "",
   contribution5 =   "",
   weakness1 =   "",
   weakness2 =   "",
   weakness3 =   "",
   weakness4 =   "",
   weakness5 =   "",
   interesting =   "high/med/low",
   opinions =   "",
}

@article{Birman93,
   author = "Birman, K..P.",
   title  = "The process group approach to reliable distributed computing",
   journal = "Communications of the ACM 36,",
   year = "1993",
   pages = "36-53",
   number = "12",
   abstract = "One might expect the reliability of a distributed system to correspond directly to the reliability of its constituents, but this is not always the case. The mechanisms used to structure a distributed system and to implement cooperation between components play a vital role in determining the reliability of the system. Many contemporary distributed operating systems have placed emphasis on communication performance, overlooking the need for tools to integrate components into a reliable whole. The communication primitives supported give generally reliable behaviour, but exhibit problematic semantics when transient failures or system configuration changes occur. The resulting building blocks are, therefore, unsuitable for facilitating the construction of systems where reliability is important. This article reviews 10 years of research on ISIS, a system that provides tools to support the construction of reliable distributed software. The thesis underlying ISIS is that development of reliable distributed software can be simplified using process groups and group programming tools. This article describes the approach taken, surveys the system, and discusses experiences with real applications",
   url    = "http://doi.acm.org/10.1145/163298.163303",
   studentname =   "",
   summary =   "",
   contribution1 =   "",
   contribution2 =   "",
   contribution3 =   "",
   contribution4 =   "",
   contribution5 =   "",
   weakness1 =   "",
   weakness2 =   "",
   weakness3 =   "",
   weakness4 =   "",
   weakness5 =   "",
   interesting =   "high/med/low",
   opinions =   "",
}

@article{Poledna95,
   author = "Poledna, S.",
   title  = "Tolerating sensor timing faults in highly responsive hard real-time systems",
   journal = "IEEE Transactions on Computers 44,",
   year = "1995",
   pages = "181-91",
   number = "2",
   abstract = "Real-time systems that have to respond to environmental state changes within a very short latency period often use event-triggered task activation. If the system has to function correctly in the presence of sensor faults, event-triggered task activation is not reliable. Faulty sensors may cause task activations to occur too early, too late, or task activations are omitted entirely. In particular, early task activations can overload the system. Time-triggered task activation is reliable, but by defining a competitiveness ratio it is shown that the processor utilization for highly responsive tasks is unacceptably low. To overcome the problems of event-triggered task activation while preserving its good performance the task-splitting model is introduced. The task-splitting model integrates fault tolerance into the analysis and construction of hard real-time systems by using a combination of event-triggered and time-triggered task activation. Based on a general task model, it is independent of any particular scheduling algorithm. The result of this work has influenced the design of a new operating system which will be applied in a robust automotive engine controller of the next generation",
   url    = "http://ieeexplore.ieee.org/iel1/12/8353/00364530.pdf",
   studentname =   "",
   summary =   "",
   contribution1 =   "",
   contribution2 =   "",
   contribution3 =   "",
   contribution4 =   "",
   contribution5 =   "",
   weakness1 =   "",
   weakness2 =   "",
   weakness3 =   "",
   weakness4 =   "",
   weakness5 =   "",
   interesting =   "high/med/low",
   opinions =   "",
}

@article{Parker83,
   author = "Parker, D.S., Jr. ; Popek, G.J. ; Rudisin, G. ; Stoughton, A. ; Walker, B.J. ; Walton, E. ; Chow, J.M. ; Edwards, D. ; Kiser, S. ; Kline, C.",
   title  = "Detection of mutual inconsistency in distributed systems",
   journal = "IEEE Transactions on Software Engineering SE-9,",
   year = "1983",
   pages = "240-7",
   number = "3",
   abstract = "Many distributed systems are now being developed to provide users with convenient access to data via some kind of communications network. In many cases it is desirable to keep the system functioning even when it is partitioned by network failures. A serious problem in this context is how one can support redundant copies of resources such as files (for the sake of reliability) while simultaneously monitoring their mutual consistency (the equality of multiple copies). This is difficult since network failures can lead to inconsistency, and disrupt attempts at maintaining consistency. In fact, even the detection of inconsistent copies is a nontrivial problem. Naive methods either (1) compare the multiple copies entirely or (2) perform simple tests which will diagnose some consistent copies as inconsistent. Here a new approach, involving version vectors and origin points, is presented and shown to detect single file, multiple copy mutual inconsistency effectively. The approach has been used in the design of LOCUS, a local network operating system at UCLA",
   url    = "http://www.ece.cmu.edu/~ece749/papers/parker83_mutual_inconsistency.pdf",
   studentname =   "",
   summary =   "",
   contribution1 =   "",
   contribution2 =   "",
   contribution3 =   "",
   contribution4 =   "",
   contribution5 =   "",
   weakness1 =   "",
   weakness2 =   "",
   weakness3 =   "",
   weakness4 =   "",
   weakness5 =   "",
   interesting =   "high/med/low",
   opinions =   "",
}

@article{Davidson86,
   author = "Davidson, S.B. ; Garcia-Molina, H. ; Skeen, D.",
   title  = "Consistency in partitioned networks",
   journal = "Computing Surveys 17,",
   year = "1986",
   pages = "341-70",
   number = "3",
   abstract = "Recently, several strategies have been proposed for transaction processing in partitioned distributed database systems with replicated data. These strategies are surveyed in light of the competing goals of maintaining correctness and achieving high availability. Extensions and combinations are then discussed, and guidelines are presented for selecting strategies for particular applications",
   url    = "http://doi.acm.org/10.1145/5505.5508",
   studentname =   "",
   summary =   "",
   contribution1 =   "",
   contribution2 =   "",
   contribution3 =   "",
   contribution4 =   "",
   contribution5 =   "",
   weakness1 =   "",
   weakness2 =   "",
   weakness3 =   "",
   weakness4 =   "",
   weakness5 =   "",
   interesting =   "high/med/low",
   opinions =   "",
}