% COLING 2012 conference paper on harvesting parallel web text for machine translation.
% Author names use "Last, First" form so the compound surname "Rangarajan Sridhar"
% is parsed as one surname. The att_* fields are non-standard metadata that standard
% bibliography styles ignore; they are kept verbatim.
@inproceedings{Parallel_semisup_Co12,
	att_abstract={The Web is an ever increasing, dynamically changing, multilingual repository of text. There have been several approaches to harvest this repository for bootstrapping, supplementing and adapting data needed for training models in speech and language applications. In this paper, we present semi-supervised and unsupervised approaches to harvesting multilingual text that rely on a key observation of link collocation. We demonstrate the effectiveness of our approach in the context of statistical machine translation by harvesting parallel texts and training translation models in 20 different languages. Furthermore, by exploiting the DOM trees of parallel webpages, we extend our harvesting technique to create parallel data for resource limited languages in an unsupervised manner. We also present some interesting observations concerning the socio-economic factors that the multilingual Web reflects.},
	att_authors={lb091c, vk947h, sb7658, ma718j},
	att_categories={},
	att_copyright={COLING},
	att_copyright_notice={The definitive version was published in  2012. {{, 2012-12-08}}
},
	att_donotupload={},
	att_private={false},
	att_projects={},
	att_tags={web crawling,  parallel text,  document model object tree,  machine translation},
	att_techdoc={true},
	att_techdoc_key={TD:100973},
	att_url={http://web1.research.att.com:81/techdocs_downloads/TD:100973_DS1_2013-01-30T21:27:59.320Z.pdf},
	author={Barbosa, Luciano and Rangarajan Sridhar, Vivek Kumar and Yarmohammadi, Mahsa and Bangalore, Srinivas},
	booktitle={Proceedings of {COLING}},
	institution={{COLING}},
	month=dec,
	title={Harvesting Parallel Text in Multiple Languages with Limited Supervision},
	year={2012},
}