#!/usr/bin/python
# -*- coding: utf-8 -*-
import wikipedia as w
import time, re, pagegenerators, codecs, mysave, pickle, os
from datetime import date
# PotatoBot Task 2: Creates redirects from ISO 639 codes and ISO names to language articles;
# checks language infoboxes
# Wikilink to the bot task page; interpolated into the header of each log page built below and in main().
tasklink = '[[w:Bots/Requests for approval/PotatoBot 2.2|Task 2]]'
# Redirect template handed to mysave.makeredir() for redirects from alternative language names.
rfromname = '{{R from alternative name}}'
# Header of the "missing language articles" log. testdict() appends one line per code
# without a target article; main() saves the accumulated text at the end of the run.
redlist = u"""Languages with [[ISO 639-3]] codes that haven't got Wikipedia articles –
there might be articles under different names, though (%s). Date: %s.\n""" % (tasklink, mysave.fmtdate(date.today()))
def ISOredir(isopage, lang1page, lang2page, langlinks, part):
    """Create or verify the redirect from an ISO 639 code page to a language article.

    Parameters:
        isopage   -- page titled 'ISO 639:<code>'; the code is taken from title()[8:].
        lang1page -- preferred target article.
        lang2page -- alternative target article; may be None (main() passes None
                     for ISO 639-5, where it is never dereferenced).
        langlinks -- wikitext naming the expected target(s), used in console and log output.
        part      -- ISO 639 part number; main() passes 1, 3 or 5.

    Returns '' when there is nothing to report, a '# ...' log line describing a
    problem, or whatever mysave.savepage()/mysave.makeredir() return for the
    edits that were made (concatenated when several edits are made).
    """
    # Initialise strings, show language in console window
    global exclWrongRedirs
    # Parts 1 and 5 carry the part number as an additional template parameter.
    partparam = ('|%d' % part) if part in (1, 5) else ''
    rfromISO = '{{R from ISO 639|%s%s}}' % (isopage.title()[8:].upper(), partparam)
    w.output('* ' + isopage.title() + ' -> ' + langlinks)
    if isopage.exists():
        if isopage.isRedirectPage():
            try:
                redirtarget = isopage.getRedirectTarget()
            except Exception:
                # Narrowed from a bare except so that KeyboardInterrupt/SystemExit
                # still stop the bot; any library error is reported as before.
                if isopage.title() not in exclWrongRedirs:
                    w.output(' \03{yellow}invalid redirect?\03{default}')
                    return '# %s: not a valid redirect?\n' % isopage.aslink()
                else:
                    return ''
            targets = [lang1page]
            if lang2page is not None:
                targets.append(lang2page)
            if redirtarget in targets:
                # {{R from ISO 639}} present?
                redirtext = isopage.get(get_redirect=True)
                if re.search(r'\{\{\s*[Rr] from ISO 639', redirtext):
                    if part not in (1, 5) or '|%d}}' % part in redirtext:
                        w.output(' redirect present and okay')
                        return ''
                    else:
                        w.output(' add {{R from ISO 639|...|part}}')
                        # Insertion point right behind the code parameter: 'from ISO 639'
                        # (12 chars) + '|' + code. Assumes a two-letter code for part 1,
                        # a three-letter code for part 5 and no extra whitespace.
                        c = redirtext.find('from ISO 639') + 15 + (part == 5)
                        return mysave.savepage(isopage, redirtext[:c] + ('|%d' % part) + redirtext[c:], '2',
                                               'Redirect from [[ISO 639-%s]]' % part)
                else:
                    w.output(' add {{R from ISO 639|...}}')
                    return mysave.savepage(isopage, redirtext + rfromISO, '2', 'Redirect from [[ISO 639]]')
            elif isopage.title() not in exclWrongRedirs:
                w.output(' \03{yellow}doesn\'t redirect to the right page?\03{default}')
                # Reuse the target fetched above instead of resolving the redirect again.
                return '# %s: redirects to %s, but the ISO list has %s\n'\
                    % (isopage.aslink(), redirtarget.aslink(), langlinks)
            else:
                return ''
        elif isopage.title() not in exclWrongRedirs:
            w.output(' \03{yellow}not a redirect\03{default}')
            return '# %s: not a redirect\n' % isopage.aslink()
        else:
            return ''
    else:
        # The code page does not exist yet: create it (and, for part 3, a
        # redirect between the two candidate names where only one exists).
        result = ''
        if lang1page.exists():
            if part == 3 and lang1page != lang2page:
                if lang2page.exists():
                    w.output(' \03{yellow}second possible target found (%s)\03{default}' % lang2page.title())
                    result = '# %s: another possible target language found, %s\n' % (isopage.aslink(), lang2page.aslink())
                else:
                    w.output(' creating redirect %s' % lang2page.title())
                    result = mysave.makeredir(lang2page, lang1page, '2', rfromname)
            w.output(' creating redirect %s' % isopage.title())
            return result + mysave.makeredir(isopage, lang1page, '2', rfromISO)
        elif part != 5 and lang2page.exists():
            lang1title = lang1page.title()
            # lang1page itself is only turned into a redirect for part 3.
            w.output(' creating redirect(s) %s, %s' % (lang1title if part == 3 else '', isopage.title()))
            if part == 3:
                result = mysave.makeredir(lang1page, lang2page, '2', rfromname)
            return result + mysave.makeredir(isopage, lang2page, '2', rfromISO)
        else:
            return ''
def addInfobox(page, iso):
    """Report that page has no language infobox; the page itself is not edited.

    Writes a notice to the console and returns the corresponding log line,
    which mentions page and the code page 'ISO 639:<iso>'.
    """
    # TODO: build an {{Infobox language}} from page.get() and store it via mysave.savepage()
    title = page.title()
    w.output(' \03{yellow}%s has no language infobox (could be added)\03{default}' % title)
    link = page.aslink()
    return '# %s ([[ISO 639:%s]]) has no language infobox (could be added)\n' % (link, iso)
def testdict(isopage, lang1page, lang2page, langlinks, langs = None):
    """Cross-check the infobox dictionary entry for one code against the ISO list.

    Parameters:
        isopage   -- page titled 'ISO 639:<code>'; the code is taken from title()[8:].
        lang1page -- preferred target article from the ISO list.
        lang2page -- alternative target article from the ISO list.
        langlinks -- wikitext naming the expected target(s), used in the missing-article log.
        langs     -- infobox dictionary entry for the code as built in main():
                     [is_main_language, page title, ...], or None if no infobox
                     uses the code. The list is not modified.

    Creates missing redirects via mysave.makeredir(), appends to the global
    redlist when no target article exists, and returns a log fragment
    ('' when there is nothing to report).
    """
    global redlist, exclTooMany, exclWrongBoxes
    # Todo: check whether lcn here = iso3 in target page of lln #
    # Todo: check ISO 639-1 and -5 codes #
    iso = isopage.title()[8:]
    if lang1page.exists():
        finalpage = lang1page
    elif lang2page.exists():
        finalpage = lang2page
    else:
        finalpage = None
    if langs:
        if len(langs) == 2:
            # Exactly one infobox carries the code.
            if finalpage:
                if finalpage.title() == langs[1]:
                    w.output(' infobox check okay')
                    return ''
                elif langs[1] not in exclWrongBoxes:
                    w.output(' \03{yellow}"%s" is in %s instead of %s\03{default}'
                             % (iso, langs[1], finalpage.title()))
                    return '# ISO code "%s" found in [[%s]] (but should be in %s per the ISO lists)\n'\
                        % (iso, langs[1], finalpage.aslink())
                else:
                    return ''
            else:
                # Neither listed name exists: point both names and the code page
                # at the article whose infobox carries the code.
                w.output(' creating redirects %s, %s, %s' %
                         (isopage.title(), lang1page.title(), lang2page.title()))
                redirto = w.Page(w.getSite(), langs[1])
                # {{R from alternative name}} is only added when the infobox has
                # the code as its main code (langs[0] is true).
                nametemplate = rfromname if langs[0] else ''
                result = ''
                if lang1page != lang2page:
                    result = mysave.makeredir(lang2page, redirto, '2', nametemplate)
                return mysave.makeredir(lang1page, redirto, '2', nametemplate) + result +\
                    mysave.makeredir(isopage, redirto, '2', '{{R from ISO 639|%s}}' % iso.upper())
        else:
            # Several infoboxes carry the code. Partition the titles instead of
            # removing items from a list while iterating over it, which skipped
            # the element following each removal.
            titles = langs[1:]
            unexpected = [title for title in titles if title not in exclTooMany]
            excluded = [title for title in titles if title in exclTooMany]
            if unexpected:
                w.output(' \03{yellow}ISO code found more than once\03{default}')
                line = '# ISO code "%s" found more than once: in [[%s]]' % (iso, ']], [['.join(unexpected))
                if excluded:
                    line += ' (already present in [[%s]])' % ']], [['.join(excluded)
                return line + '\n'
            else:
                return ''
    elif finalpage:
        # A target article exists but no infobox carries the code.
        if finalpage.title() not in exclWrongBoxes:
            templates = finalpage.templates()
            if 'Infobox language' in templates or 'Infobox Language' in templates:
                msg = 'a language infobox with a wrong code?'
            elif '#' not in finalpage.title():
                return addInfobox(finalpage, iso)
            else:
                msg = 'no language infobox'
            w.output(' \03{yellow}%s has %s\03{default}' % (finalpage.title(), msg))
            return '# %s (%s) has %s\n' % (finalpage.aslink(), isopage.aslink(), msg)
        else:
            return ''
    else:
        redlist += '# %s: %s\n' % (iso, langlinks)
        w.output(' \03{purple}no target language found for %s\03{default}' % isopage.title())
        return ''
def fmtLang(rawlang):
    """Turn a raw language name from the ISO tables into an article title.

    A wikilinked (possibly piped) entry yields its link target. Names that
    already contain 'language', as well as '(none)' and 'undetermined', are
    returned stripped but otherwise unchanged. Any other name gets ' language'
    appended; 'X, Y' is reordered to 'Y X', and a trailing bracketed part is
    moved behind 'language' unless it contains a digit, in which case it is
    dropped.
    """
    if rawlang == '':
        return ''
    linked = re.search(r'\[\[(.*?)[\]\|]', rawlang)
    name = linked.group(1).strip() if linked else rawlang.strip()
    lowered = name.lower()
    if 'language' in lowered or name == '(none)' or lowered == 'undetermined':
        return name
    paren = name.find('(')
    comma = name.find(', ')
    if comma == -1:
        # Without a comma, cut the base name just before the space preceding the bracket.
        comma = paren - 1
    pieces = []
    if comma > -1:
        if paren > -1:
            pieces.append(name[comma + 2:paren])
        else:
            pieces.append(name[comma + 2:] + ' ')
        pieces.append(name[:comma])
    else:
        pieces.append(name)
    pieces.append(' language')
    if paren > -1 and not re.search(r'\d', name[paren:]):
        pieces.append(' ' + name[paren:])
    return ''.join(pieces).strip()
def DABsearch(dab, isotarget):
    """Check whether the disambiguation page titled dab leads to isotarget.

    Returns a two-item list: the wikilink '[[<dab>]]' if the page exists and
    is a disambiguation page (otherwise ''), and a flag telling whether the
    page redirects to isotarget or links to a page that resolves to it.
    """
    candidate = w.Page(w.getSite(), dab)
    usable = candidate.exists() and candidate.isDisambig()
    if not usable:
        return ['', False]
    if candidate.isRedirectPage():
        hit = candidate.getRedirectTarget() == isotarget
    else:
        # Stops at the first matching link.
        hit = any(mysave.resolveredir(link) == isotarget for link in candidate.linkedPages())
    return ['[[' + dab + ']]', hit]
def TLDabs(isopage):
    """Check the two/three letter disambiguation pages belonging to a code page.

    For an existing isopage, the upper- and lower-case code and their
    '(disambiguation)' variants are searched in turn until one of them leads
    to the resolved target of isopage. Returns '' if isopage does not exist
    or a link was found, otherwise a log line listing the pages searched.
    """
    if not isopage.exists():
        return ''
    code = isopage.title()[8:].upper()
    lowercode = code.lower()
    candidates = (code, lowercode, code + ' (disambiguation)', lowercode + ' (disambiguation)')
    target = mysave.resolveredir(isopage)
    seen = []
    linked = False
    for candidate in candidates:
        link, hit = DABsearch(candidate, target)
        seen.append(link)
        if hit:
            linked = True
            break
    if linked:
        w.output(' dab %s okay' % code)
        return ''
    w.output(' \03{yellow}missing link from dab %s\03{default}' % code)
    return '# %s: %s\n' % (' '.join(seen), target.aslink())
def loadTabs(file, name):
    """Read a tab-separated UTF-8 table into a dictionary.

    The first line of file is skipped. For every other line the first three
    characters become the key and the tab-separated fields starting at
    character 4 - minus the last field - become the value, each stripped of
    surrounding whitespace. The number of entries is printed together with
    name, and the dictionary is returned.
    """
    table = {}
    with codecs.open(file, 'r', 'utf-8') as handle:
        for index, row in enumerate(handle):
            if index == 0:
                continue
            fields = row[4:].split('\t')[:-1]
            table[row[:3]] = [field.strip() for field in fields]
    w.output('%s entries loaded: %d' % (name, len(table)))
    return table
def main():
    """Run the whole task and save the three log pages.

    Steps, in order:
    1. Load the SIL, retired-code, macrolanguage and ISO 639-5 tables from
       data/*.txt and the three exclusion lists from
       User:PotatoBot/Excludes/Language articles.
    2. Load - or build and cache in data/isodict.pck - a dictionary that maps
       every code found in {{Infobox language}} transclusions to
       [is_main_language, page title, ...].
    3. For each list page 'ISO 639:a' ... 'ISO 639:z': update the table rows
       from the SIL data, save the page if it changed, then create/check the
       redirects, disambiguation pages and infobox codes for every row.
    4. Create/check the ISO 639-5 redirects, log infobox codes that were not
       listed, and save the logs.
    """
    global exclTooMany, exclWrongRedirs, exclWrongBoxes
    # Prepare log
    listout = ('Log for the creation of [[ISO 639-3]] redirects and checking of codes in '
               '{{tl|Infobox language}} transclusions (%s). Date: %s.\n\n'
               '<small>If you have checked an entry in this list and found it to be correct, '
               'please add it to the [[User:PotatoBot/Excludes/Language articles|exclusion list]].</small>\n'
               % (tasklink, mysave.fmtdate(date.today())))
    dablist = u'Three letter disambiguation pages missing ISO 639-3 code, or with a wrong ISO code (%s). Date: %s.\n' \
        % (tasklink, mysave.fmtdate(date.today()))
    # Load data from text files and excludes
    w.output('')
    SIL = loadTabs('data/SIL_tab.txt', 'SIL')
    retired = loadTabs('data/retired.txt', 'Retired')
    # Retired codes missing from the SIL table get a stub row that only carries the name.
    for code in retired:
        if code not in SIL:
            SIL[code] = ['', '', '', '', '', retired[code][0], '']
    # Loaded (and counted on the console) but not used further in this function.
    macro = loadTabs('data/macro.txt', 'Macrolanguage')
    iso5 = loadTabs('data/iso5.txt', 'ISO 639-5')
    exclWrongRedirs = [page.title() for page in pagegenerators.LinkedPageGenerator(w.Page(w.getSite(),
        'User:PotatoBot/Excludes/Language articles#WrongRedir'))]
    exclWrongBoxes = [page.title() for page in pagegenerators.LinkedPageGenerator(w.Page(w.getSite(),
        'User:PotatoBot/Excludes/Language articles#WrongBox'))]
    exclTooMany = [page.title() for page in pagegenerators.LinkedPageGenerator(w.Page(w.getSite(),
        'User:PotatoBot/Excludes/Language articles#TooMany'))]
    w.output('\nExcludes loaded: %d wrong redir(s), %d wrong infobox(es), %d multiple codes'
             % (len(exclWrongRedirs), len(exclWrongBoxes), len(exclTooMany)))
    # Create/load dictionary of ISO codes in language infoboxes.
    # The file modes are left as they were so an existing cache file stays
    # readable; the handles are now closed deterministically.
    if os.path.isfile('data/isodict.pck'):
        with open('data/isodict.pck', 'r') as f:
            isodict = pickle.load(f)
    else:
        isodict = {}
        params = ['iso3'] + ['lc%d' % (n+1) for n in range(99)]
        for page in pagegenerators.ReferringPageGenerator(w.Page(w.getSite(), 'Template:Infobox language'),
                                                          onlyTemplateInclusion=True):
            if page.namespace() == 0:
                w.output('* page %s' % (page.aslink()))
                for template in page.templatesWithParams():
                    if template[0].lower() == 'infobox language':
                        for param in template[1]:
                            value = param.partition('=')
                            code = value[2].strip()
                            if value[0].strip() in params and code not in ['', 'none']:
                                # 'iso3' marks the article's main code, 'lcN' a dialect code.
                                mainlang = value[0].strip() == 'iso3'
                                w.output(' > code "%s" found' % code + (not mainlang) * ' (dialect)')
                                if code not in isodict:
                                    isodict[code] = [mainlang]
                                if (isodict[code][0] == mainlang) or (page.title() in isodict[code]):
                                    if page.title() in isodict[code]:
                                        isodict[code][0] = True
                                    isodict[code].append(page.title())
                                elif mainlang:
                                    # A main-language use replaces dialect-only uses collected so far.
                                    isodict[code] = [True, page.title()]
        with open('data/isodict.pck', 'w') as f:
            pickle.dump(isodict, f)
    w.output('\nLanguage infoboxes loaded: %d' % len(isodict))
    # Header cell with an anchor template: '!... {{anchor|xyz}}'. The escapes
    # were transposed before ('\\s*|'), which split the pattern into two alternatives.
    regex1 = re.compile(r'^!(.*)\{\{\s*[Aa]nchor\s*\|\s*([a-z]{3})\s*}}')
    # Header cell with a piped link: '!...[[Target|xyz]]'
    regex2 = re.compile(r'^!(.*)\[\[(.*)\|([a-z]{3})\]\]')
    # Data row: six '||'-separated cells after the leading '|'
    regex3 = re.compile(r'^\|(.*?)\|\|(.*?)\|\|(.*?)\|\|(.*?)\|\|(.*?)\|\|(.*?)($|\|\|)')
    # Create article list and run
    for a in range(97, 123):
        abclist = w.Page(w.getSite(), 'ISO 639:'+chr(a))
        lines = abclist.get().splitlines(True)
        # Include Ethnologue and SIL data, update tables
        lineNo, alphaerror = 1, False
        for i in range(26 ** 2):
            # Floor division keeps the argument of chr() an integer.
            code = chr(a) + chr(i // 26 + 97) + chr(i % 26 + 97)
            anchor = '{{anchor|%s}}' % code
            # Advance to the next line that directly follows a row separator or
            # the table end, skipping lines that contain '...'.
            while lineNo < len(lines) - 1 and (lines[lineNo-1][:2] not in ('|-', '|}') or '...' in lines[lineNo]):
                lineNo += 1
            if anchor not in lines[lineNo] and code in SIL:
                # Code known to SIL but missing from the table: insert an empty row for it.
                lines[lineNo-1:lineNo-1] = ['|-\n', '!%s {{anchor|%s}}\n' % (code, code), 10 * '| |' + '|\n']
            if lineNo < len(lines) - 1 and anchor in lines[lineNo]:
                if code in SIL:
                    if '[[' not in lines[lineNo] and code not in ('mis', 'mul', 'und', 'zxx'):
                        lines[lineNo] = '![[%s|%s]] {{anchor|%s}}\n' % (fmtLang(SIL[code][5]), code, code)
                    # Fill in the name cell if it is empty.
                    search = regex3.search(lines[lineNo+1])
                    if search.group(6).strip() == '':
                        lines[lineNo+1] = lines[lineNo+1][:search.start(6)] + SIL[code][5] + lines[lineNo+1][search.end(6):]
                    # Overwrite the first two cells with SIL columns 2 and 0.
                    search = regex3.search(lines[lineNo+1])
                    lines[lineNo+1] = lines[lineNo+1][:search.start(1)] + (SIL[code][2] or ' ') + \
                        lines[lineNo+1][search.end(1):search.start(2)] + (SIL[code][0] or ' ') + \
                        lines[lineNo+1][search.end(2):]
                    if code not in retired:
                        # Third cell: SIL columns 3 and 4 joined by '/'.
                        search = regex3.search(lines[lineNo+1])
                        lines[lineNo+1] = lines[lineNo+1][:search.start(3)] + SIL[code][3] + '/' + SIL[code][4] + \
                            lines[lineNo+1][search.end(3):]
                    elif '!(' not in lines[lineNo]:
                        # Retired code: wrap the link in parentheses.
                        # NOTE(review): the reviewed source had lost its indentation; this
                        # elif is assumed to pair with 'if code not in retired' - confirm.
                        b1 = lines[lineNo].find('[[')
                        b2 = lines[lineNo].find(']]') + 2
                        lines[lineNo] = '!(' + lines[lineNo][b1:b2] + ')' + lines[lineNo][b2:]
                # Style the row separator after the three-character third cell, except for 'I/L'.
                search = regex3.search(lines[lineNo+1])
                scopetype = lines[lineNo+1][search.start(3):search.end(3)]
                lines[lineNo-1] = '|-' + (len(scopetype) == 3 and scopetype != 'I/L') * \
                    ('{{ISO 639-3 style|%s|%s}}' % (scopetype[0:1], scopetype[2:3])) + '\n'
                lineNo += 1
        # Any anchor left behind the last processed row was not reached in
        # alphabetical order.
        while lineNo < len(lines) - 1:
            lineNo += 1
            if '{{anchor|' in lines[lineNo].lower():
                alphaerror = True
        text = ''.join(lines)
        if alphaerror:
            lines = abclist.get().splitlines(True)
            w.output(' \03{yellow}%s not sorted alphabetically: using old table\03{default}' % abclist.title())
            listout += u'# List %s does not seem to be sorted alphabetically – using old table\n' % abclist.aslink()
        elif text != abclist.get():
            # Only saved when the table passed the order check: the rebuilt text
            # of an unsorted table may contain rows inserted in the wrong place.
            listout += mysave.savepage(abclist, text, '2.2', 'Update, wikilinks')
        # Create iso, lang1, lang2
        for n in range(len(lines)):
            iso, iso1, lang1, lang2 = '', '', '', ''
            search1 = regex1.search(lines[n])
            search2 = regex2.search(lines[n])
            if search1:
                iso = search1.group(2)
            elif search2:
                iso = search2.group(3)
            if lines[n][0:1] == '!' and n < len(lines)-1:
                search3 = regex3.search(lines[n+1])
                if search3:
                    lang2 = fmtLang(search3.group(6))
                    iso1 = search3.group(1).strip()
                if search2:
                    # The link target is the first candidate; it also serves as
                    # the second one when the row has no name.
                    lang1 = search2.group(2).strip()
                    if not lang2:
                        lang2 = lang1
                else:
                    # No link: derive the first candidate from the name by moving
                    # a bracketed qualifier to the front.
                    p = lang2.find('(')
                    if p > 0:
                        lang1 = (lang2[p+1:-1] + ' ' + lang2[:p-1]).strip()
                    else:
                        lang1 = lang2
            # If a language is found, create redirects; log problems
            if iso != '' and lang1 != '' and lang2 != '':
                w.output('')
                isopage = w.Page(w.getSite(), 'ISO 639:' + iso)
                iso1page = w.Page(w.getSite(), 'ISO 639:' + iso1)
                lang1page = mysave.resolveredir(w.Page(w.getSite(), lang1))
                lang2page = mysave.resolveredir(w.Page(w.getSite(), lang2))
                # A disambiguation page is never a target: fall back to the other candidate.
                if lang1page.exists() and lang1page.isDisambig():
                    lang1page = lang2page
                if lang2page.exists() and lang2page.isDisambig():
                    lang2page = lang1page
                langlinks = lang1page.aslink() + (lang1page.title() != lang2page.title()) * (' / ' + lang2page.aslink())
                if not (lang2page.exists() and lang2page.isDisambig()):
                    listout += ISOredir(isopage, lang1page, lang2page, langlinks, 3)
                    dablist += TLDabs(isopage)
                    if iso1 != '':
                        listout += ISOredir(iso1page, lang1page, lang2page, langlinks, 1)
                        dablist += TLDabs(iso1page)
                    # Popping the entry leaves only unlisted codes in isodict at the end.
                    listout += testdict(isopage, lang1page, lang2page, langlinks, isodict.pop(iso, None))
                else:
                    w.output(' \03{red}only disambigs found for code %s\03{default}' % iso)
                    listout += '# Only disambiguation pages found for code %s\n' % iso
            elif lines[n][0:1] == '!':
                w.output(' \03{red}could not parse "%s" in %s\03{default}' % (lines[n].strip(), abclist.aslink()))
                listout += '# Could not parse line "<nowiki>%s</nowiki>" in %s\n' % (lines[n].strip(), abclist.aslink())
    # ISO 639-5 redirects
    for code in iso5:
        iso5page = w.Page(w.getSite(), 'ISO 639:' + code)
        listout += ISOredir(iso5page, mysave.resolveredir(w.Page(w.getSite(), iso5[code][0])), None, iso5[code][0], 5)
        dablist += TLDabs(iso5page)
    # Log invalid codes
    for item in isodict:
        listout += '# Code "%s" in [[%s]] not listed\n' % (item, ']], [['.join(isodict[item][1:]))
    # Output logs
    w.output('')
    mysave.savepage(w.Page(w.getSite(), 'User:PotatoBot/Lists/ISO 639 log'), listout, '2.2', 'Creating [[ISO 639-3]] log')
    mysave.savepage(w.Page(w.getSite(), 'User:PotatoBot/Lists/ISO 639 language articles missing'), redlist, '2.2',
                    'Creating list of missing [[ISO 639-3]] language articles')
    mysave.savepage(w.Page(w.getSite(), 'User:PotatoBot/Lists/Dabs without ISO 639 codes'), dablist, '2.2',
                    'Creating list of missing [[ISO 639-3]] codes in disambiguation pages')
# Run the task only when the file is executed as a script.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Called even when main() raises.
        # NOTE(review): presumably releases the framework's throttle slot - confirm against the wikipedia module.
        w.stopme()