Wikipedia:WikiProject Interlanguage Links/Scripts
Into a mysql database load the page and langlinks database dump files from two or more wikipedia Wikipedia:Database_download.
Extract interlanguage links into a single table for anaylysis
CREATE TABLE inter ( from_lang varchar(10), from_title varchar(255), to_lang varchar(10), to_title varchar(255) ); USE en; INSERT INTO inter.inter SELECT 'en', page.page_title, langlinks.ll_lang, langlinks.ll_title FROM page, langlinks WHERE page.page_id = langlinks.ll_from AND page.page_namespace = 0; USE de; INSERT INTO inter.inter SELECT 'de', page.page_title, langlinks.ll_lang, langlinks.ll_title FROM page, langlinks WHERE page.page_id = langlinks.ll_from AND page.page_namespace = 0; ... etc
Tidy up this table and index it:
update inter set from_title = replace( from_title, '_', ' ' ); update inter set to_title = replace( to_title, '_', ' ' ); // Clear nonsensical entries -- entries to nonsensical languages -- entries to blank titles -- entries to unlikely titles -- poss entries to nonexistant articles ? ALTER TABLE inter ADD INDEX ( from_lang, from_title ); ALTER TABLE inter ADD INDEX ( to_lang, to_title );
Extract the information required, ie:
// suggested reciporcal links // a:x -> b:x and b:x exists and no link from b:? to a:x exists => b:x -> a:x INSERT INTO suggestions SELECT a.to_lang, a.to_title, a.from_lang, a.from_title FROM inter a INNER JOIN en.page p ON p.page_title = a.to_title AND p.page_namespace = 0 AND p.page_is_redirect = 0 LEFT JOIN inter b ON b.from_lang = a.to_lang AND b.from_title = a.to_title AND b.to_lang = a.from_lang WHERE a.from_lang IN ('de','es','fr','it','nl','ja','pl','pt','sv' ) AND a.to_lang = 'en' AND b.from_lang IS NULL; // Interlanguage links to pages that do not exist // en -> fr only INSERT INTO suggestions SELECT a.from_lang, a.from_title, a.to_lang, a.to_title FROM inter a INNER JOIN en.page f ON f.page_title = a.from_title AND f.page_namespace = 0 AND f.page_is_redirect = 0 LEFT JOIN fr.page t ON t.page_title = a.to_title AND t.page_namespace = 0 WHERE a.from_lang = 'en' AND a.to_lang = 'fr' AND t.page_title IS NULL // Interlanguage links to redirects // en -> fr only SELECT a.from_lang, a.from_title, a.to_lang, a.to_title FROM inter a INNER JOIN en.page f ON f.page_title = a.from_title AND f.page_namespace = 0 AND f.page_is_redirect = 0 INNER JOIN fr.page t ON a.to_title = t.page_title AND t.page_namespace = 0 AND t.page_is_redirect = 1 WHERE a.from_lang = 'en' AND a.to_lang = 'fr';
Finally, extract these suggestions in a human-readable format
DROP PROCEDURE IF EXISTS report_suggestions; DELIMITER // CREATE PROCEDURE report_suggestions( group_size INT, flang VARCHAR(10), tlang VARCHAR(10) ) BEGIN DECLARE sug_pos, sug_base, done INT; DECLARE ftitle, ttitle VARCHAR(255); DECLARE sug CURSOR FOR SELECT DISTINCT from_title, to_title FROM suggestions WHERE from_lang = flang AND to_lang = tlang; DECLARE CONTINUE HANDLER FOR SQLSTATE '23000' SET done = 1; SET sug_pos = 0; SET sug_base = 0; SET done = 0; OPEN sug; REPEAT FETCH sug INTO ftitle, ttitle; IF NOT done THEN IF sug_pos = 0 THEN SELECT concat( '=== ', sug_base, ' - ', sug_base + group_size - 1, ' ===' ); SET sug_base = sug_base + group_size; SET sug_pos = group_size - 1; ELSE SET sug_pos = sug_pos - 1; END IF; SELECT concat( '*[[', ftitle, ']] → [[:', tlang, ':', ttitle, ']]' ); END IF; UNTIL done END REPEAT; CLOSE sug; END; // DELIMITER ; call report_suggestions( 10, 'en', 'sv' );