% Conference-paper record TD:100391: "Crawling Back and Forth" (IJCNLP, November 2011).
% The att_* fields are non-standard metadata; unknown fields are ignored by bibliography styles.
% NOTE(review): att_copyright_notice names both IJCNLP and Very Large Databases as the
%   venue of the definitive version; the second statement looks stale -- confirm before removing.
% NOTE(review): `institution` is not read by standard inproceedings styles; it is kept
%   unchanged in case other tooling consumes it -- confirm whether `organization` was intended.
% NOTE(review): the third author's surname is entered as the compound "Rangarajan Sridhar"
%   so it is not split as Last=Sridhar -- confirm against the published paper.
@inproceedings{TD:100391,
	att_abstract={Recently, there has been an increased interest in Web parallel
text for tasks such as machine translation and cross-language information
retrieval. Although previous
works have addressed many aspects of it, including
document pair selection, and sentence and word alignment, the
problem of discovering bilingual data sources on a large
scale has been overlooked to a great extent.
In this paper, we propose a novel crawling strategy to locate
bilingual sites which aims to achieve a balance between the
two conflicting requirements of this problem: the need to perform
a broad search while at the same time avoiding the need to crawl
unproductive Web regions. Our solution does so by focusing on
the graph neighborhood of bilingual sites and exploring
the patterns of the links in this region to guide its visitation policy.
To detect such sites, we introduce a two-step strategy that, first, relies on common patterns
found in the internal links of these sites to compose a classifier
that identifies candidate pages as entry points to parallel data in these sites,
and then, verifies whether these pages are in fact in the languages
of interest. Our experimental evaluation shows that our crawler outperforms previous
crawling approaches for this task and produces a
high-quality collection of bilingual sites.
},
	att_authors={lb091c, sb7658, vk947h},
	att_categories={C_IIS.3},
	att_copyright={AFNLP},
	att_copyright_notice={The definitive version was published in IJCNLP. {{, 2011-11-15}}

The definitive version was published in Very Large Databases, 2011. {{, 2011-11-15}}
},
	att_donotupload={},
	att_private={false},
	att_projects={},
	att_tags={},
	att_techdoc={true},
	att_techdoc_key={TD:100391},
	att_url={http://web1.research.att.com:81/techdocs_downloads/TD:100391_DS1_2011-09-01T21:06:52.493Z.pdf},
	author={Barbosa, Luciano and Bangalore, Srinivas and Rangarajan Sridhar, Vivek Kumar},
	booktitle={Proceedings of {IJCNLP}},
	institution={{IJCNLP}},
	month=nov,
	title={Crawling Back and Forth: Using Back and Out Links to Locate Bilingual Sites},
	year=2011,
}