# DatRem.py -- shared-state module (concatenated above the main script)

# Module-level shared state for the DatRem bot task.
mycatlist = []    # category titles queued for traversal
pagelist = []     # page titles collected from those categories
datelistst = []   # start offsets of matched dated section headers
datelistend = []  # end offsets of matched dated section headers
removed = 0       # running count of date headers removed
added = 0         # reserved counter (not updated in this file)


# Main Program (uses the DatRem module above for shared state)

from wikitools import *
import time
import datetime
import urllib
import json
import userpassbot #Bot password
import warnings
import re
import mwparserfromhell
import datetime
import sys
import DatRem

site = wiki.Wiki() #Tell Python to use the English Wikipedia's API
site.login(userpassbot.username, userpassbot.password) #login with the credentials held in userpassbot

def pnt(s):
    """Print *s*, falling back to its UTF-8 byte form when the console
    encoding cannot represent some characters (e.g. accented filenames)."""
    try:
        print(s)
    except UnicodeEncodeError:
        encoded = s.encode('utf-8')
        print(encoded)

      
def startAllowed():
    """Return "run" when the bot's on-wiki run page contains exactly
    "Run", otherwise "no" -- the kill switch for this task."""
    runpage = page.Page(site, "User:RonBot/8/Run").getWikiText()
    return "run" if runpage == "Run" else "no"

def allow_bots(text, user):
    user = user.lower().strip()
    text = mwparserfromhell.parse(text)
    for tl in text.filter_templates():
        if tl.name.matches(['bots', 'nobots']):
            break
    else:
        return True
    print "template found" #Have we found one
    for param in tl.params:
        bots = [x.lower().strip() for x in param.value.split(",")]
	if param.name == 'allow':
            print "We have an ALLOW" # allow found
            if ''.join(bots) == 'none': return False
            for bot in bots:
                if bot in (user, 'all'):
                    return True
        elif param.name == 'deny':
            print "We have a DENY" # deny found
            if ''.join(bots) == 'none':
                print "none - true"
                return True
	    for bot in bots:
                if bot in (user, 'all'):
                    pnt(bot)
                    pnt(user)
                    print "all - false"
                    return False
    if (tl.name.matches('nobots') and len(tl.params) == 0):
        print "match - false"
        return False
    return True

def findpages(nextcat):
    """Walk category *nextcat* through the MediaWiki API, appending any
    sub-categories found to DatRem.mycatlist and any ordinary pages to
    DatRem.pagelist (both kept duplicate-free). Follows API continuation
    tokens until the whole category has been listed."""
    lastContinue=''
    touse=''
    print nextcat
    while True:
        # categorymembers listing; cmcontinue resumes the previous batch
        params = {'action':'query',
                  'list':'categorymembers',
                  'cmtitle':nextcat,
                  'cmlimit':'max',
                  'cmcontinue':lastContinue
                  }
        req = api.APIRequest(site, params) #Set the API request
        res = req.query(False) #Send the API request and store the result in res
        touse = pagelist.listFromQuery(site, res['query']['categorymembers'])#Make a list
        pnt(touse)
        for filep in touse: #For page in the list
            pagename=filep.unprefixedtitle
            # Sub-categories are queued for a later findpages() pass by
            # main()'s while loop; plain pages go to the work list.
            if "Category" in pagename:
                if pagename not in DatRem.mycatlist:
                    DatRem.mycatlist.append(pagename)
                    pnt("APPENDING "+pagename)
                    print len(DatRem.mycatlist)
                else:
                    pnt("NOT APPENDING "+pagename) 
            else:
                if pagename not in DatRem.pagelist: #Have we a unique page name?
                    DatRem.pagelist.append(pagename)
                    pnt(pagename)
                else:
                    print "page in list"
        # 'continue' is absent from the response once the listing is done
        if 'continue' not in res:
            break
        lastContinue = res['continue']['cmcontinue']
        print "continue"
    return 

def examinetext(text):
    DatRem.datelistst=()
    DatRem.datelistst=list()
    DatRem.datelistend=()
    DatRem.datelistend=list()
    last=0
    for match in re.finditer(r'^=\s(January|February|March|April|May|June|July|August|September|October|November|December)[\S\s]*?=$',text,re.MULTILINE):
        foundstart=match.start()
        foundend=match.end()
        founddate=text[match.start():match.end()]
        pnt(founddate)
        DatRem.datelistst.append(match.start())
        DatRem.datelistend.append(match.end())
        print match.start(), match.end(), match.start()-last
        last=match.start()
    #We need list in revese as numbers will change as we remove text, so work from bottom up.
    print "Reverse"
    DatRem.datelistst.reverse()
    DatRem.datelistend.reverse()
    ListLen=len(DatRem.datelistst)
    LastStart=DatRem.datelistend[0]+11
    #We need to look for start later date - end earlier date is, say <10, and test for just wite space inbetween
    for loopvar in range(0, ListLen):
        print DatRem.datelistst[loopvar], DatRem.datelistend[loopvar]
        print LastStart-DatRem.datelistend[loopvar]
        print LastStart
        if LastStart-DatRem.datelistend[loopvar]<10:
            print "Remove"
            losetext=text[DatRem.datelistst[loopvar]:DatRem.datelistend[loopvar]+1]
            print repr(losetext)
            gap=text[LastStart-2:DatRem.datelistend[loopvar]+2]
            print repr(gap)
            if gap.isspace():
                print "All WhiteSpace"
                pnt("++++"+losetext+"++++")
                print"-------------------------------"
                print repr(text[0:DatRem.datelistst[loopvar]])
                print"-------------------------------"
                print repr(text[DatRem.datelistend[loopvar]+2:])
                print"-------------------------------"
                text=text[0:DatRem.datelistst[loopvar]]+text[DatRem.datelistend[loopvar]+2:]
                pnt(text)
                DatRem.removed += 1
        LastStart=DatRem.datelistst[loopvar] 
        print LastStart
        print len(text)
    return text


def checkpage():
    size=len(DatRem.pagelist)
    print size
    for pagetitle in DatRem.pagelist:
        pagetitletext = pagetitle.encode('utf-8')
        print pagetitletext
        pagepage = page.Page(site, pagetitle)
        pagetext = pagepage.getWikiText()
        go = allow_bots(pagetext, 'RonBot')# does page allow bots
        if go:
            print"++++++++++++++++++++++++++++++++++++++++"
            print"REMOVAL bot allowed on article"
            pnt(pagetext)
            print len(pagetext)
            pagetext=examinetext(pagetext)
            pnt(pagetext)
            print len(pagetext)
            if DatRem.removed>0:
                try:
                    #pagepage.edit(text=pagetext, bot=True, summary="(Task 8 - uesrpace trial) - Removal of unused date headers") #(DO NOT UNCOMMENT UNTIL BOT IS APPROVED)
                    print "writing changed page"
                except:
                    print"Failed to write"
                print"++++++++++++++++++++++++++++++++++++++++"


    return

def main():
    go = startAllowed() #Check if task is enabled
    DatRem.mycatlist=list()
    DatRem.pagelist=list()
    DatRem.mycatlist.append("Category:Wikpedia Help pages with dated sections")
    DatRem.removed=0
    listnum=0
    while listnum<len(DatRem.mycatlist):
        pnt( "CAT" + DatRem.mycatlist[listnum])
        findpages(DatRem.mycatlist[listnum])
        listnum+=1
        print "LIST No. ", listnum
        print len(DatRem.pagelist)
    #Test System - KILL NEXT TWO LINES
    #DatRem.pagelist=list()
    #DatRem.pagelist.append("User:Ronhjones/Sandbox5")
    if len(DatRem.pagelist)>0:
        checkpage()
    
    print DatRem.removed
    print (time.ctime())
      
if __name__ == "__main__":
    # Silence FutureWarnings (e.g. from the wikitools library) for the
    # whole run, then start the task.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", FutureWarning)
        main()