# -*- coding: utf-8 -*- import wikipedia import pagegenerators import re import warnings from time import sleep from sys import stdout from oclc import isbn2oclc # This is required for the text that is shown when you run this script # with the parameter -help. docuReplacements = { '¶ms;': pagegenerators.parameterHelp } TEMPLATE_PREFIX = u"Template:" SITE = wikipedia.getSite() def pagesUsingTemplate(templateName): transclusionPageName = unicode(SITE.namespace(10)) + u":" + templateName transclusionPage = wikipedia.Page(SITE, transclusionPageName) gen = pagegenerators.ReferringPageGenerator(transclusionPage, onlyTemplateInclusion=True) return gen class BailOut(StandardError): """Immediately stop processing the current page""" class OCLCBot: # Edit summary message that should be used. EDIT_SUMMARY = u'Adding [[OCLC]]# to book infobox based on [[ISBN]] ([[User:CobraBot|CobraBot]]; PLEASE [[User talk:CobraBot|report any problems]])' BOOK_INFOBOX = u"Infobox Book" DASHES = [u'-', u'‒', u'–', u'—', u'―'] TERMINATOR = re.compile(u"(}})|\\|") INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))", re.IGNORECASE) OCLC_PARAM = u"\\|[ \t\n]*oclc[ \t\n]*=[ \t\n]*" ISBN_MIN_LEN = 10 def __init__(self, debug): """ Constructor. Parameters: * generator - The page generator that determines on which pages to work on. * debug - If True, doesn't do any real changes, but only shows what would have been changed. """ self.generator = pagesUsingTemplate(self.BOOK_INFOBOX) self.debug = debug self.editCount = 0 self.log = file("skipped.log", 'a') def run(self): N = 371+145+36+29+38+26+48+56+48+188+85+45+171+130+105 # Set the edit summary message wikipedia.setAction(self.EDIT_SUMMARY) print "Advancing by %s..." % N stdout.flush() for i in xrange(N): next(self.generator) print "Done advancing!" stdout.flush() for pageIndex, page in enumerate(self.generator): self.treat(page, pageIndex) self.log.close() ######### def partition(self, text): boxmatch = self.INFOBOX_START.search(text) if not boxmatch: wikipedia.output(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive") raise BailOut, "SKIPPING: Page either uses 'Book infobox' alias or is false positive" boxStart = boxmatch.start() boxEnd = boxStart + re.search(u"\\}\\}", text[boxStart:]).end() prebox = text[:boxStart] box = text[boxStart:boxEnd] postbox = text[boxEnd:] return prebox, box, postbox def checkForOclc(self, box): paramMatch = re.search(self.OCLC_PARAM, box) if paramMatch: #has |oclc= oclcValAndRest = box[paramMatch.end():] oclcTermMatch = self.TERMINATOR.search(oclcValAndRest) value = oclcValAndRest[:oclcTermMatch.start()].strip() # | oclc = VALUE | if value: #already has |oclc= filled in wikipedia.output(u"SKIPPING: oclc param already filled") raise BailOut, "SKIPPING: oclc param already filled" else: #remove the |oclc= # print "REMOVED OCLC:", repr(paramMatch.group()) box = box[:paramMatch.start()] + box[paramMatch.start()+len(paramMatch.group()):] # print "NEW BOX:" # print box return box return box def findIsbnVal(self, box): paramMatch = re.search(u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*", box) if not paramMatch: #no ISBN present wikipedia.output(u"SKIPPING: No isbn param present") raise BailOut, "SKIPPING: No isbn param present" isbnValAndRest = box[paramMatch.end():] termMatch = self.TERMINATOR.search(isbnValAndRest) isbnVal = isbnValAndRest[:termMatch.start()] relIsbnTerm = self.TERMINATOR.search(isbnValAndRest).start() isbnTerm = paramMatch.end() + relIsbnTerm isbnFrag = isbnValAndRest[:relIsbnTerm] if '[[' in isbnFrag and ']]' not in isbnFrag: wikipedia.output(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle") raise BailOut, "SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle" return isbnVal, isbnTerm def removeDashes(self, isbn): for dash in self.DASHES: isbn = isbn.replace(dash, '') return isbn def checkForNA(self, isbn): if re.match(u"N/?A", isbn, re.IGNORECASE): wikipedia.output(u"SKIPPING: ISBN Not/Applicable") raise BailOut, "SKIPPING: ISBN Not/Applicable" def removeExtraISBN(self, isbnVal): match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal) if match: return isbnVal[match.end():] return isbnVal def firstWord(self, isbnVal): wordMatch = re.search("[^ \t\n<,;\\[\\]]+", isbnVal) return wordMatch.group() def normalize(self, string): return string.replace(u' ',u'').replace(u"-",u'').replace(u"and", u"&").replace(u',', u'').replace(u'.', u'').replace(u"'", u'').replace(u'"', u'').replace(u"’", u'').lower().replace(u"the", u'') def treat(self, page, pageIndex): """ Loads the given page, does some changes, and saves it. """ print "==================================================================" # if u"British" not in page.title(): return # raw_input("Continue?") print "PAGE TITLE:", page.title() print "PAGE#:", pageIndex+1 print "EDIT COUNT:", self.editCount if page.namespace() != 0: wikipedia.output(u"SKIPPING: Non-article namespace!") return try: # Load the page text = page.get() except wikipedia.NoPage: wikipedia.output(u"Page %s does not exist; skipping." % page.aslink()) return except wikipedia.IsRedirectPage: wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink()) return ################################################################ # NOTE: Here you can modify the text in whatever way you want. # ################################################################ # If you find out that you do not want to edit this page, just return. try: prebox, box, postbox = self.partition(text) # print "BOX:" # print box box = self.checkForOclc(box) isbnVal, isbnTerm = self.findIsbnVal(box) # print "INITIAL ISBN:", repr(isbnVal) isbnVal = self.removeDashes(isbnVal).strip() # print "ISBN SANS DASH:", repr(isbnVal) isbnVal = self.removeExtraISBN(isbnVal) self.checkForNA(isbnVal) # print "ISBN SANS ISBN:", repr(isbnVal) if not isbnVal: #empty |isbn= wikipedia.output(u"SKIPPING: Empty isbn param") raise BailOut, "SKIPPING: Empty isbn param" isbn = self.firstWord(isbnVal) # print "ONE TRUE ISBN:", isbn print "ISBN#:", isbn if len(isbn) < self.ISBN_MIN_LEN: wikipedia.output(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn) raise BailOut, ("SKIPPING: Malformed ISBN, too short (%s)" % isbn) if not re.search("[0-9]", isbn): wikipedia.output(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn) raise BailOut, ("SKIPPING: Malformed ISBN, no numbers (%s)" % isbn) except BailOut as e: self.log.write(page.title().encode('utf8')+"; "+e.message+"\n") return #do lookup try: oclc, oclcTitle = isbn2oclc(isbn) except RuntimeError as e: wikipedia.output(u"ABORTED: Problem looking up OCLC# (%s)" % e.message) return print "PAGE TITLE:", page.title() wikiCanon = self.normalize(page.title().split(u"(")[0]) oclcCanon = self.normalize(oclcTitle.split(u":")[0]) titlesMatch = oclcCanon.startswith(wikiCanon) if titlesMatch: print print "--Canonical titles DO MATCH.--" else: print wikiCanon print oclcCanon box = box[:isbnTerm] + "| oclc= "+oclc+(" " if self.debug else "\n") + box[isbnTerm:] text = prebox + box + postbox # only save if something was changed if text != page.get(): # Show the title of the page we're working on. # Highlight the title in purple. wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) # show what was changed wikipedia.showDiff(page.get(), text) # raw_input("Continue?") # sleep(3) if not self.debug: choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N') if choice != 'y': return try: # Save the page page.put(text) except wikipedia.LockedPage: wikipedia.output(u"Page %s is locked; skipping." % page.aslink()) except wikipedia.EditConflict: wikipedia.output(u'Skipping %s because of edit conflict' % (page.title())) except wikipedia.SpamfilterError, error: wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url)) else: self.editCount += 1 def main(): DEBUG = False # True bot = OCLCBot(DEBUG) with warnings.catch_warnings(): warnings.simplefilter("ignore") bot.run() if __name__ == "__main__": try: main() finally: wikipedia.stopme()