; Hand this in to: ece849-staff+hw (at) ece.cmu.edu
;
; NOTE: the address above is spelled with "(at)" on purpose. BibTeX treats every
; at-sign as the start of an entry, even in free text between entries, so a literal
; e-mail address here makes the parser report an error. Substitute the real sign
; when mailing.
;
; Layout conventions used below: one field per line, brace-delimited values,
; authors joined with "and" in "Last, First" form, page ranges written with "--".
; The fields from "studentname" through "opinions" are the homework template and
; are ignored by bibliography styles; fill them in, do not rename them.

@article{mitra05_soft_error_resilience,
  author        = {Mitra, S. and Seifert, N. and Zhang, M. and Shi, Q. and Kim, K. S.},
  title         = {Robust system design with built-in soft-error resilience},
  journal       = {IEEE Computer},
  volume        = {38},
  number        = {2},
  pages         = {43--52},
  year          = {2005},
  abstract      = {Transient errors caused by terrestrial radiation pose a major barrier to
                   robust system design. A system's susceptibility to such errors increases
                   in advanced technologies, making the incorporation of effective
                   protection mechanisms into chip designs essential. A new design paradigm
                   reuses design-for-testability and debug resources to eliminate such
                   errors.},
  url           = {http://ieeexplore.ieee.org/iel5/2/30429/01401773.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; NOTE(review): the journal field was missing from this article entry. "IEEE Computer"
; was filled in from the URL path (iel1/2/...), which matches the other IEEE Computer
; entries in this file; confirm against the publisher record.
@article{reibman91_reliability_modeling,
  author        = {Reibman, A. and Veeraraghavan, M.},
  affiliation   = {Bell Labs},
  title         = {Reliability modeling: an overview for system designers},
  journal       = {IEEE Computer},
  organization  = {Bell Labs},
  volume        = {24},
  number        = {4},
  pages         = {49--57},
  year          = {1991},
  abstract      = {The role of reliability models in system design is examined. Methods for
                   predicting system reliability are discussed, covering the choice of
                   metric for analysis, creating the system reliability model, and refining
                   the model. A case study is presented to illustrate reliability
                   modeling.},
  url           = {http://ieeexplore.ieee.org/iel1/2/2541/00076262.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; NOTE(review): this entry carries both an author and an editor, which the classic
; styles warn about for the inbook type, and it has no publisher. The title looks like
; the title of the edited book rather than of chapter 5. It probably wants to become an
; incollection entry with the chapter title in "title" and the book title in
; "booktitle" -- TODO confirm the chapter title and publisher before changing it.
@inbook{dugan95_dependability,
  author        = {Dugan, Joanne Bechta and Lyu, Michael R.},
  affiliation   = {Bell Labs},
  title         = {Software Fault Tolerance},
  editor        = {Lyu},
  organization  = {University of Virginia, VA, USA},
  chapter       = {5},
  pages         = {109--138},
  year          = {1995},
  abstract      = {Three major fault-tolerant software system architectures, distributed
                   recovery blocks, N-version programming, and N self-checking programming,
                   are modeled by a combination of fault tree techniques and Markov
                   processes. In these three architectures, transient and permanent
                   hardware faults as well as unrelated and related software faults are
                   modeled in the system-level domain. The model parameter values are
                   determined from the analysis of data collected from a fault-tolerant
                   avionic application. Quantitative analyses for reliability and safety
                   factors achieved in these three fault-tolerant system architectures are
                   presented.},
  url           = {http://www.ece.cmu.edu/~ece849/papers/dugan95_depend_modeling.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{schlichting83_failstop,
  author        = {Schlichting, Richard D. and Schneider, Fred B.},
  title         = {Fail-stop processors: an approach to designing fault-tolerant computing
                   systems},
  journal       = {ACM Transactions on Computer Systems},
  volume        = {1},
  number        = {3},
  pages         = {222--238},
  year          = {1983},
  abstract      = {A methodology that facilitates the design of fault-tolerant computing
                   systems is presented. It is based on the notion of a failstop processor.
                   Such a processor automatically halts in response to any internal failure
                   and does so before the effects of that failure become visible. The
                   problem of implementing processors that, with high probability, behave
                   like fail-stop processors is addressed. Axiomatic program verification
                   techniques are described for use in developing provably correct programs
                   for failstop processors. The design of a process control system
                   illustrates the use of our methodology.},
  url           = {http://citeseer.ist.psu.edu/schlichting83failstop.html},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; Supplemental Readings

@inproceedings{bossen81_edfi,
  author        = {Bossen, D. and Hsiao, M.},
  affiliation   = {IBM, USA},
  title         = {{ED/FI}: A Technique for Improving Computer System {RAS}},
  booktitle     = {Fault-Tolerant Computing 1995, Highlights from Twenty-Five Years},
  organization  = {FTCS},
  year          = {1995},
  abstract      = {ED/FI (error detection and fault isolation) is a model for projecting
                   the ability of a computer system to dynamically detect hardware errors
                   during normal operation, and to automatically isolate the fault causing
                   the error based only on information captured at the time the error is
                   detected. This general approach to fault isolation solves the difficult
                   problem of intermittent fault diagnosis based on testing. This model has
                   been used to project the error detection and fault isolation
                   characteristics of a number of products, and experimental results show
                   good correlation with the model's projections.},
  url           = {http://ieeexplore.ieee.org/iel3/3846/11214/00532644.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {generally low ratings in 2005},
}

@inproceedings{bouricius71_reliability,
  author        = {Bouricius, W. and Carter, W. and Jessep, D. and Schneider, P. and Wadia, A.},
  affiliation   = {IBM, USA},
  title         = {Reliability modeling for fault tolerant computers},
  booktitle     = {Fault-Tolerant Computing 1995, Highlights from Twenty-Five Years},
  organization  = {FTCS},
  year          = {1995},
  abstract      = {Reliability modeling and the mathematical equations involved are
                   discussed for general computer systems organized to be fault-tolerant.
                   This paper summarizes the work done over the last four years on
                   mathematical reliability modeling by the authors.},
  url           = {http://ieeexplore.ieee.org/iel3/3846/11214/00532626.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Sahner87,
  author        = {Sahner, R. A. and Trivedi, K. S.},
  title         = {Reliability modeling using {SHARPE}},
  journal       = {IEEE Transactions on Reliability},
  volume        = {R-36},
  number        = {2},
  pages         = {186--193},
  year          = {1987},
  abstract      = {The authors present an approach for avoiding the large state-space
                   problem. The approach uses a hierarchical modeling technique for
                   analyzing complex reliability models. It allows the flexibility of
                   Markov models where necessary and retains the efficiency of
                   combinatorial solution where possible. Based on this approach, a
                   computer program called SHARPE (symbolic hierarchical automated
                   reliability and performance evaluator) has been written. The
                   hierarchical modeling technique provides a very flexible mechanism for
                   using decomposition and aggregation to model large systems; it allows
                   for both combinatorial and Markov or semi-Markov submodels, and can
                   analyze each model to produce a distribution function. The choice of the
                   number of levels of models and the model types at each level is left up
                   to the modeler. Component distribution functions can be any exponential
                   polynomial whose range is between zero and one. Examples show how
                   combinations of models can be used to evaluate the reliability and
                   availability of large systems using SHARPE},
  url           = {http://www.ece.cmu.edu/~ece749/papers/sahner87_sharpe.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Barbara87,
  author        = {Barbara, D. and Garcia-Molina, H.},
  title         = {The reliability of voting mechanisms},
  journal       = {IEEE Transactions on Computers},
  volume        = {C-36},
  number        = {10},
  pages         = {1197--1208},
  year          = {1987},
  abstract      = {In a faulty distributed system, voting is commonly used to achieve
                   mutual exclusion among groups of isolated nodes. Each node is assigned a
                   number of votes, and any group with a majority of votes can perform the
                   critical operations. The problem of selecting vote assignments in order
                   to maximize the probability that the critical operations can be
                   performed at a given time by some group of nodes is addressed. Simple
                   heuristics to assign votes are suggested, and it is shown that they give
                   good results in most cases. Three particular homogeneous topologies
                   (fully connected, Ethernet, and ring networks) are studied, and
                   analytical expressions for system reliability are derived that provide
                   useful insights into the reliability provided by voting mechanisms},
  url           = {http://www.ece.cmu.edu/~ece749/papers/barbara87_voting_reliability.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@inproceedings{Malhis95,
  author        = {Malhis, L. M. and Sanders, W. H. and Schlichting, R. D.},
  title         = {Numerical evaluation of a group-oriented multicast protocol using
                   stochastic activity networks},
  booktitle     = {Proceedings of the Sixth International Workshop on Petri Nets and
                   Performance Models},
  pages         = {63--72},
  year          = {1995},
  abstract      = {Group-oriented multicast protocols that provide message ordering and
                   delivery guarantees are becoming increasingly important in distributed
                   system design. However, despite the large number of such protocols,
                   little analytical work has been done concerning their performance,
                   especially in the presence of message loss. This paper illustrates a
                   method for determining the performability of group-oriented multicast
                   protocols using stochastic activity networks, a stochastic extension to
                   Petri nets, and reduced base model construction. In particular, we study
                   the performability of one such protocol, called Psync, under a wide
                   variety of workload and message loss probabilities. The specific focus
                   is on measuring two quantities, the stabilization time-that is, the time
                   required for messages to arrive at all hosts-and channel utilization.
                   The analysis shows that Psync works well when message transmissions are
                   frequent, but exhibits extremely long message stabilization times when
                   transmissions are infrequent and message losses occur. The results
                   provide useful insight on the behavior of Psync, as well as serve as a
                   guide for evaluating the performability of other group-oriented
                   multicast protocols},
  url           = {http://ieeexplore.ieee.org/iel3/4022/11544/00524316.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Rai87,
  author        = {Rai, S. and Sarje, A. K. and Prasad, E. V. and Kumar, A.},
  title         = {Two recursive algorithms for computing the reliability of k-out-of-n
                   systems},
  journal       = {IEEE Transactions on Reliability},
  volume        = {R-36},
  number        = {2},
  pages         = {261--265},
  year          = {1987},
  abstract      = {The authors present two recursive methods to compute reliability of a
                   k-out-of-n system. The method is simple and computationally efficient
                   when compared with other current methods. Examples illustrate the
                   technique. The algorithms are presented in a recursive language with an
                   Algol-like notation. The algorithms are easy to remember and can be used
                   for manual computations},
  url           = {http://www.ece.cmu.edu/~ece749/papers/rai87_k_of_n_reliability.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

; NOTE(review): the abstract of the next entry ends mid-sentence ("... find a lower");
; restore the remainder from the publisher record.
@article{Abraham74,
  author        = {Abraham, J. A. and Siewiorek, D. P.},
  title         = {An algorithm for the accurate reliability evaluation of triple modular
                   redundancy networks},
  journal       = {IEEE Transactions on Computers},
  volume        = {C-23},
  number        = {7},
  pages         = {682--692},
  year          = {1974},
  abstract      = {There are several instances where the classical method of triple-modular
                   redundancy (TMR) reliability modeling may provide predictions which are
                   inadequate. It is shown that for even simple networks such as those
                   exhibiting fan in and fan out, classical methods may predict a
                   reliability that is higher than or lower than the actual reliability.
                   Furthermore, the classical method gives no hint as to whether the
                   predicted number is high or low. As a solution to this problem, a method
                   of partitioning an arbitrary network into cells such that faults in a
                   cell are independent of faults in other cells is proposed. An algorithm
                   is then given to calculate the reliability of any such cell, by
                   considering only the structure of the interconnections within the cells.
                   The value of the reliability found is exact if TMR is assumed to be a
                   coherent system. An approximation to the algorithm is also described;
                   this can be used to find a lower},
  url           = {http://www.ece.cmu.edu/~ece749/papers/abraham74_tmr_evaluation.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Geist90,
  author        = {Geist, R. and Trivedi, K. S.},
  title         = {Reliability estimation of fault-tolerant systems: tools and techniques},
  journal       = {IEEE Computer},
  volume        = {23},
  number        = {7},
  pages         = {52--61},
  year          = {1990},
  abstract      = {A comparative evaluation of state-of-the-art tools and techniques for
                   estimating the reliability of fault-tolerant computing systems is
                   presented. The theory of reliability estimation is briefly reviewed.
                   Five current approaches are compared in detail: HARP (hybrid automated
                   reliability predictor), SURE (semi-Markov unreliability range
                   estimator), HEIRESS (hierarchical estimation of interval reliability by
                   skewed sampling), SHARPE (symbolic hierarchical automated reliability
                   and performance evaluator), and SAVE (system availability estimator).
                   Particular attention is given to design limitations imposed by
                   underlying model assumptions, on the one hand, and the efficiency and
                   accuracy of the solution techniques employed, on the other hand},
  url           = {http://ieeexplore-beta.ieee.org//iel1/2/2058/00056852.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}

@article{Cullyer89,
  author        = {Cullyer, W. J.},
  title         = {Implementing high integrity systems: the {VIPER} microprocessor},
  journal       = {IEEE Aerospace and Electronic Systems Magazine},
  volume        = {4},
  number        = {6},
  pages         = {5--13},
  year          = {1989},
  abstract      = {The author describes the development of VIPER and points out some of the
                   practical problems encountered over the four years of the project.
                   Informal proofs of correctness, carried out in the early stages of the
                   project, are outlined. A peer review group criticized the lack of
                   multiplication and division instructions in VIPER 1. This deficiency is
                   corrected in VIPER 2, and the performance is increased to 3 MIPs},
  url           = {http://ieeexplore.ieee.org/iel2/761/492/00009638.pdf},
  studentname   = {},
  summary       = {},
  contribution1 = {},
  contribution2 = {},
  contribution3 = {},
  contribution4 = {},
  contribution5 = {},
  weakness1     = {},
  weakness2     = {},
  weakness3     = {},
  weakness4     = {},
  weakness5     = {},
  interesting   = {high/med/low},
  opinions      = {},
}