% Record TD:100047 -- Joshi, Hiltunen, Schlichting, Sanders:
% "Probabilistic Model-Driven Recovery in Distributed Systems" (June 2010).
% The att_* fields are non-standard metadata; standard styles ignore unknown fields.
% NOTE(review): the institution field holds a journal name. If this record is the
% published journal paper, it probably belongs in an article entry with a journal
% field (plus volume/number/pages) -- confirm before changing the entry type.
@techreport{TD:100047,
	att_abstract={{Automatic system monitoring and recovery has the potential to provide effective, low-cost ways to improve dependability in distributed software systems. However, automating recovery is challenging in practice because accurate fault diagnosis is difficult given the common monitoring tools and techniques with low fault coverage, poor fault localization, detection delays, and false positives. In this paper, we present a holistic model-based approach that overcomes these challenges and enables automatic recovery in distributed systems. To do so, it uses theoretically sound techniques including Bayesian estimation and Markov decision theory to provide controllers that choose good, if not optimal, recovery actions according to a user-defined optimization criteria. By combining monitoring and recovery, the approach realizes benefits that could not have been obtained by using them in isolation. We experimentally validate our framework by fault injection on realistic e-commerce systems.}},
	att_authors={kj2681, mh7921, rs2497},
	att_categories={C_NSS.3, C_NSS.4, C_NSS.5, C_CCF.8},
	att_copyright={{IEEE}},
	att_copyright_notice={{}},
	att_donotupload={true},
	att_private={false},
	att_projects={},
	att_tags={Fault tolerance,  Monitoring,  Diagnosis,  Recovery,  Distributed systems,  Adaptive systems,  POMDP,  Bayesian},
	att_techdoc={true},
	att_techdoc_key={TD:100047},
	att_url={},
	author={Joshi, Kaustubh and Hiltunen, Matti and Schlichting, Richard and Sanders, William},
	institution={{IEEE Transactions on Dependable and Secure Computing}},
	month=jun,
	title={Probabilistic Model-Driven Recovery in Distributed Systems},
	year=2010,
}