; Required Readings
; Reminder: please submit these no later than ** 10 pm Tuesday night **, thanks.
;
; Note on the review fields: studentname, summary, contribution1-5,
; weakness1-5, interesting and opinions are not standard BibTeX fields, so
; standard bibliography styles ignore them. They are left blank here as a
; per-paper review template (presumably to be filled in before submission;
; "interesting" is expected to hold one of high/med/low).

@inproceedings{maffeis95adding,
  author        = "Maffeis, Silvano",
  title         = "Adding Group Communication and Fault-Tolerance to {CORBA}",
  booktitle     = "Proceedings of the {USENIX} Conference on Object-Oriented
                   Technologies ({COOTS})",
  pages         = "135--146",
  year          = "1995",
  abstract      = "Groupware and fault-tolerant distributed systems stimulate
                   the need for structuring activities around objectgroups and
                   reliable multicast communication. The objectgroup
                   abstraction permits to treat a collection of networkobjects
                   as if they were a single object; clients can invoke
                   operations on object-groups without needing to know the
                   exact membership of the group. Object-groups mainly serve
                   to increase reliability through replication, performance
                   through parallelism, or to distribute data from ...",
  url           = "http://citeseer.nj.nec.com/maffeis95adding.html",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{felber96design,
  author        = "Felber, P. and Garbinato, B. and Guerraoui, R.",
  title         = "The Design of a {CORBA} Group Communication Service",
  booktitle     = "Proceedings of the 15th Symposium on Reliable Distributed
                   Systems ({SRDS}-15)",
  address       = "Niagara-on-the-Lake, Canada",
  pages         = "150--159",
  year          = "1996",
  abstract      = "The Common Object Request Broker Architecture (CORBA) is
                   becoming a middleware standard for distributed application
                   development, and there are increasing needs in enriching
                   the basic functionalities of CORBA. Whereas mechanisms for
                   persistence, transactions, event channels, etc., have been
                   designed and specified for CORBA, no support is provided to
                   handle object replication. In this paper we discuss the
                   issue of augmenting CORBA with group communication, which
                   is considered an adequate paradigm ...",
  url           = "http://citeseer.nj.nec.com/felber96design.html",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

@inproceedings{narasimhan02_ftcorba_lessons,
  author        = "Narasimhan, P. and Moser, L. E. and Melliar-Smith, P. M.",
  title         = "Lessons Learned in Building a Fault-Tolerant {CORBA} System",
  booktitle     = "Proceedings of the International Conference on Dependable
                   Systems and Networks",
  pages         = "39--44",
  year          = "2002",
  abstract      = "The Eternal system pioneered the interception approach to
                   providing transparent fault tolerance for CORBA, which
                   allows it to make a CORBA application reliable with little
                   or no modification to the application or the ORB. The
                   design and implementation of the Eternal system has
                   influenced industrial practices by providing the basis for
                   the specifications of the Fault-Tolerant CORBA standard
                   that the Object Management Group adopted. In this paper, we
                   discuss our experience in developing the Eternal system,
                   with particular emphasis on the challenges that we
                   encountered and the lessons that we learned.",
  url           = "http://www.ece.cmu.edu/~ece749/papers/narasimhan02_ftcorba_lessons.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}

; Supplemental Reading

@inproceedings{Merlin78,
  author        = "Merlin, P. M. and Randell, B.",
  title         = "State restoration in distributed systems",
  booktitle     = "{FTCS}-8: The Eighth Annual International Conference on
                   Fault-Tolerant Computing",
  year          = "1978",
  pages         = "129--134",
  abstract      = "This paper concerns an important aspect of the problem of
                   designing fault-tolerant distributed computing systems. The
                   concepts involved in `backward error recovery', i.e.
                   restoring a system, or some part of a system, to a previous
                   state which it is hoped or believed preceded the occurrence
                   of any existing errors are formalised, and generalised so
                   as to apply to concurrent, e.g. distributed, systems. Since
                   in distributed systems there may exist a great deal of
                   independence between activities, the system can be restored
                   to a state that could have existed rather than to a state
                   that actually existed. The formalisation is based on the
                   use of what is termed `Occurrence Graphs' to represent the
                   cause-effect relationships that exist between the events
                   that occur when a system is operational, and to indicate
                   existing possibilities for state restoration. A protocol is
                   presented which could be used in each of the nodes in a
                   distributed computing system in order to provide system
                   recoverability in the face even of multiple faults.",
  url           = "http://ieeexplore-beta.ieee.org//iel3/3846/11214/00532636.pdf",
  studentname   = "",
  summary       = "",
  contribution1 = "",
  contribution2 = "",
  contribution3 = "",
  contribution4 = "",
  contribution5 = "",
  weakness1     = "",
  weakness2     = "",
  weakness3     = "",
  weakness4     = "",
  weakness5     = "",
  interesting   = "high/med/low",
  opinions      = "",
}