#!/usr/bin/env python # -*- coding: utf-8 -*- import wikipedia import pagegenerators import re import warnings from time import sleep from contextlib import closing from sys import stdout from json import dump, load from itertools import ifilter # This is required for the text that is shown when you run this script # with the parameter -help. docuReplacements = { '¶ms;': pagegenerators.parameterHelp } SITE = wikipedia.getSite() def pagesUsingTemplate(templateName): transclusionPageName = unicode(SITE.namespace(10)) + u":" + templateName transclusionPage = wikipedia.Page(SITE, transclusionPageName) gen = pagegenerators.ReferringPageGenerator(transclusionPage, onlyTemplateInclusion=True) return gen def has_disambiguator(page): return u'(' in page.title() def list_redirects_to(page): return page.getReferences(follow_redirects=False,redirectsOnly=True) def wordsRegex(words): return "(?:%s)" % ("|".join("(?:%s)" % word for word in words)) class CobraBot(object): EDIT_SUMMARY = u'Superfluous disambiguation removed per [[WP:NAMB]] ([[Wikipedia:BOTPOL#Assisted_editing_guidelines|assisted editing]] using [[User:CobraBot|CobraBot]]; [[User talk:Cybercobra]])' PERSON_SUMMARY = u'Person disambiguation tweaked ([[Wikipedia:BOTPOL#Assisted_editing_guidelines|assisted editing]] using [[User:CobraBot|CobraBot]]; [[User talk:Cybercobra]])' DABLINK = u"Dablink" DISAMBIGUATION = re.compile(u"\\{\\{[ \t]*" + wordsRegex("about dablink otheruses for the redirect this twootheruses".split() + ["other uses", "two other uses"]) +"[^}]*\\}\\}(\n?)", re.IGNORECASE) DB_MOVE = "{{db-move|%s|Evidently not ambiguous}}\n" OFFSET_FILE = 'N.json' def __init__(self, debug): """ Constructor. Parameters: * generator - The page generator that determines on which pages to work on. * debug - If True, doesn't do any real changes, but only shows what would have been changed. """ self.generator = ifilter(has_disambiguator, pagesUsingTemplate(self.DABLINK)) self.debug = debug self.editCount = 0 self.log = file("skipped.log", 'a') self.log.write("BEGIN NEW SESSION\n") wikipedia.setAction(self.EDIT_SUMMARY) def run(self): with closing(file(self.OFFSET_FILE, 'r')) as f: N = load(f) # Set the edit summary message print "Advancing by %s..." % N stdout.flush() for i in xrange(N): next(self.generator) print "Done advancing!" stdout.flush() try: for pageIndex, page in enumerate(self.generator): wikipedia.setAction(self.EDIT_SUMMARY) self.treat(page, pageIndex) finally: self.log.close() with closing(file(self.OFFSET_FILE, 'w')) as f: dump(N+pageIndex-5, f) ######### def treat(self, page, pageIndex): """ Loads the given page, does some changes, and saves it. """ print "==================================================================" print "PAGE TITLE:", page.title() print "PAGE#:", pageIndex+1 print "EDIT COUNT:", self.editCount if page.namespace() != 0: wikipedia.output(u"SKIPPING: Non-article namespace!") return try: # Load the page text = page.get() except wikipedia.NoPage: wikipedia.output(u"Page %s does not exist; skipping." % page.aslink()) return except wikipedia.IsRedirectPage: wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink()) return disams = list(re.finditer(self.DISAMBIGUATION, text)) if not disams: self.log.write("FALSE POSITIVE: "+page.title().encode('utf8')+"\n") print "FALSE POSITIVE:", page.title().encode('utf8') return print "REDIRECTS:" redirects = list(list_redirects_to(page)) print " ", "\n ".join([redirect.title() for redirect in redirects]) norm_with_caps = page.title().split(u"(")[0].strip() normalized_title = norm_with_caps.lower() if any(redir.title().lower() == normalized_title for redir in redirects): print "***PRIMARY TOPIC REDIRECTS HERE***" person = False dbmove = False while True: print "Choose option:" print "[0] Skip page" for i, disamb in enumerate(disams): lineno = text[:disamb.start()].count("\n") print "[%s] (line %s): %s" % (i+1, lineno, disamb.group().strip()) try: input = raw_input("Enter number of your choice: ") choice = int(input) except ValueError: if input == "person": person = True choice = 1 break if input == "dbmove": dbmove = True break print "Invalid input; try again." else: if choice <= len(disams): break else: print "Invalid input; try again." if dbmove: target = wikipedia.Page(SITE, norm_with_caps) text = self.DB_MOVE % page.title() + target.get() page = target elif choice == 0: print "SKIPPED" return else: redo = choice < 0 if choice < 0: choice = -choice choice -= 1 redact = disams[choice] if person: wikipedia.setAction(self.PERSON_SUMMARY) text = text[:redact.start()] + "{{otherpeople|%s}}\n" % norm_with_caps + text[redact.end():] else: text = text[:redact.start()] + text[redact.end():] # only save if something was changed if text != page.get(): # Show the title of the page we're working on. # Highlight the title in purple. wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) # show what was changed wikipedia.showDiff(page.get(), text) # raw_input("Continue?") # sleep(3) if dbmove or self.debug: choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N') if choice == 'n': return try: # Save the page page.put(text) except wikipedia.LockedPage: wikipedia.output(u"Page %s is locked; skipping." % page.aslink()) except wikipedia.EditConflict: wikipedia.output(u'Skipping %s because of edit conflict' % (page.title())) except wikipedia.SpamfilterError, error: wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url)) else: self.editCount += 1 if redo: self.treat(wikipedia.Page(SITE, page.title()), pageIndex) def main(): DEBUG = False bot = CobraBot(DEBUG) with warnings.catch_warnings(): warnings.simplefilter("ignore") bot.run() if __name__ == "__main__": try: main() finally: wikipedia.stopme()