# ---- File: DatRem.py (shared module-level state, imported by the main program) ----
# Shared mutable state for the bot; the main program reads and writes these
# through the module object (DatRem.<name>).
mycatlist = []    # category titles queued for traversal
pagelist = []     # page titles collected from those categories
datelistst = []   # start offsets of the date headings found in a page
datelistend = []  # end offsets of the same headings
removed = 0       # running count of headings removed
added = 0         # not referenced by the visible main program
# ---- File: main program (imports DatRem for its shared state) ----
from wikitools import *
import time
import datetime
import urllib
import json
import userpassbot #Bot password
import warnings
import re
import mwparserfromhell
import datetime
import sys
import DatRem
# Create the wiki session object used by every API call in this script. No URL
# is passed; per the original note below this targets the English Wikipedia.
site = wiki.Wiki() #Tell Python to use the English Wikipedia's API
# Authenticate with the credentials kept in the separate userpassbot module.
site.login(userpassbot.username, userpassbot.password) #login
# Output helper: page and file names may contain accented characters that the
# console encoding cannot represent.
def pnt(s):
    """Print *s*; if that raises UnicodeEncodeError, print its UTF-8 bytes."""
    try:
        print(s)
    except UnicodeEncodeError:
        encoded = s.encode('utf-8')
        print(encoded)
def startAllowed():
    """Read the on-wiki run switch for this task.

    Fetches the wikitext of "User:RonBot/8/Run" and returns the string "run"
    when that text is exactly "Run"; any other content yields "no".
    """
    switch_text = page.Page(site, "User:RonBot/8/Run").getWikiText()
    return "run" if switch_text == "Run" else "no"
def allow_bots(text, user):
    """Decide whether *user* may edit a page, based on {{bots}}/{{nobots}}.

    Args:
        text: wikitext of the page to inspect.
        user: bot account name; compared lower-cased and stripped.

    Returns:
        True when editing is permitted, False when the page opts this bot out.
        A page without a bots/nobots template always permits editing.
    """
    botname = user.lower().strip()

    # Only the first {{bots}} / {{nobots}} template on the page is considered.
    found = None
    for template in mwparserfromhell.parse(text).filter_templates():
        if template.name.matches(['bots', 'nobots']):
            found = template
            break
    if found is None:
        return True

    print("template found")  # Have we found one
    for param in found.params:
        listed = [entry.lower().strip() for entry in param.value.split(",")]
        if param.name == 'allow':
            print("We have an ALLOW")  # allow found
            if ''.join(listed) == 'none':
                return False
            if any(name in (botname, 'all') for name in listed):
                return True
        elif param.name == 'deny':
            print("We have a DENY")  # deny found
            if ''.join(listed) == 'none':
                print("none - true")
                return True
            for name in listed:
                if name in (botname, 'all'):
                    pnt(name)
                    pnt(botname)
                    print("all - false")
                    return False

    # A bare {{nobots}} with no parameters excludes every bot.
    if found.name.matches('nobots') and len(found.params) == 0:
        print("match - false")
        return False
    return True
def findpages(nextcat):
    """Sort the members of category *nextcat* into the shared DatRem lists.

    Queries the API's ``categorymembers`` list for *nextcat*, following
    ``cmcontinue`` until the response carries no 'continue' key. Each member
    title containing the substring "Category" is appended to
    ``DatRem.mycatlist``; every other title is appended to ``DatRem.pagelist``.
    Titles already present in the relevant list are not added again.

    Args:
        nextcat: category title passed as ``cmtitle``.
    """
    continue_token = ''
    print(nextcat)
    while True:
        query = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': nextcat,
            'cmlimit': 'max',
            'cmcontinue': continue_token,
        }
        # Send the request and keep the raw result for the continuation check.
        res = api.APIRequest(site, query).query(False)
        # 'pagelist' here is the name brought in by "from wikitools import *"
        # (presumably wikitools' pagelist module), not DatRem.pagelist.
        members = pagelist.listFromQuery(site, res['query']['categorymembers'])
        pnt(members)
        for member in members:
            # NOTE(review): the title is taken from 'unprefixedtitle' and is
            # classified purely by containing "Category" - confirm that this
            # attribute still carries the namespace prefix for subcategories.
            title = member.unprefixedtitle
            if "Category" not in title:
                if title in DatRem.pagelist:
                    print("page in list")
                else:
                    # First time this page name has been seen.
                    DatRem.pagelist.append(title)
                    pnt(title)
            elif title in DatRem.mycatlist:
                pnt("NOT APPENDING " + title)
            else:
                DatRem.mycatlist.append(title)
                pnt("APPENDING " + title)
                print(len(DatRem.mycatlist))
        if 'continue' not in res:
            break
        continue_token = res['continue']['cmcontinue']
        print("continue")
def examinetext(text):
    """Strip date headings that are directly followed by another date heading.

    Finds every level-1 heading that starts with "= <Month name>" and ends
    with "=" at the end of a line. Working from the bottom of the page
    upwards, a heading is removed when only whitespace separates it from the
    heading found after it. The last heading on the page is never removed.

    Side effects: ``DatRem.datelistst`` / ``DatRem.datelistend`` are replaced
    with the heading start/end offsets (last heading first), and
    ``DatRem.removed`` is incremented once per removed heading.

    Args:
        text: wikitext of the page.

    Returns:
        The wikitext with the empty date headings removed. It is returned
        unchanged when the page has no matching heading at all.
    """
    heading = re.compile(
        r'^=\s(January|February|March|April|May|June|July|August|September|October|November|December)[\S\s]*?=$',
        re.MULTILINE)
    starts = []
    ends = []
    last = 0
    for match in heading.finditer(text):
        pnt(match.group(0))
        starts.append(match.start())
        ends.append(match.end())
        print("%d %d %d" % (match.start(), match.end(), match.start() - last))
        last = match.start()

    # Offsets shift as text is removed, so process the headings bottom-up.
    print("Reverse")
    starts.reverse()
    ends.reverse()
    DatRem.datelistst = starts
    DatRem.datelistend = ends

    # Pages without any date heading used to fail on ends[0] with IndexError.
    if not starts:
        print(len(text))
        return text

    # Seeding 11 characters past the last heading keeps it above the "< 10"
    # threshold below, so the final heading on the page is always retained.
    LastStart = ends[0] + 11
    for start, end in zip(starts, ends):
        print("%d %d" % (start, end))
        print(LastStart - end)
        print(LastStart)
        if LastStart - end < 10:
            print("Remove")
            losetext = text[start:end + 1]
            print(repr(losetext))
            # This slice is non-empty and free of the heading's own "=" only
            # when the two headings are 2 or 3 characters apart, so in
            # practice just those gaps can pass the whitespace test.
            gap = text[LastStart - 2:end + 2]
            print(repr(gap))
            if gap.isspace():
                print("All WhiteSpace")
                pnt("++++" + losetext + "++++")
                print("-------------------------------")
                print(repr(text[0:start]))
                print("-------------------------------")
                print(repr(text[end + 2:]))
                print("-------------------------------")
                # Drop the heading together with the two characters after it.
                text = text[0:start] + text[end + 2:]
                pnt(text)
                DatRem.removed += 1
        LastStart = start
        print(LastStart)
    print(len(text))
    return text
def checkpage():
    """Run the date-heading clean-up over every page in ``DatRem.pagelist``.

    For each title the wikitext is fetched and checked with ``allow_bots``
    for the account 'RonBot'; permitted pages are passed through
    ``examinetext``. A page counts as changed only when ``DatRem.removed``
    grew while that page was processed. The counter is cumulative across
    pages, so it is compared against a per-page snapshot rather than zero.
    """
    print(len(DatRem.pagelist))
    for pagetitle in DatRem.pagelist:
        print(pagetitle.encode('utf-8'))
        pagepage = page.Page(site, pagetitle)
        pagetext = pagepage.getWikiText()
        # Does the page allow this bot?
        if not allow_bots(pagetext, 'RonBot'):
            continue
        print("++++++++++++++++++++++++++++++++++++++++")
        print("REMOVAL bot allowed on article")
        pnt(pagetext)
        print(len(pagetext))
        removed_before = DatRem.removed
        pagetext = examinetext(pagetext)
        pnt(pagetext)
        print(len(pagetext))
        if DatRem.removed > removed_before:
            try:
                #pagepage.edit(text=pagetext, bot=True, summary="(Task 8 - uesrpace trial) - Removal of unused date headers") #(DO NOT UNCOMMENT UNTIL BOT IS APPROVED)
                print("writing changed page")
            except Exception:
                print("Failed to write")
        print("++++++++++++++++++++++++++++++++++++++++")
def main():
    """Walk the dated-sections category tree and clean every page found.

    Does nothing unless ``startAllowed()`` returns "run"; previously the
    switch was read but its result was never checked. Otherwise the shared
    DatRem state is reset, the root category is queued, and ``findpages`` is
    called for each queued category (the queue can grow while it is being
    walked). The collected pages are then handed to ``checkpage``. Finally
    the total number of removed headings and the current time are printed.
    """
    # Honour the on-wiki run switch before touching anything.
    if startAllowed() != "run":
        print("Task disabled by run page - exiting")
        return

    DatRem.mycatlist = list()
    DatRem.pagelist = list()
    # NOTE(review): "Wikpedia" may be a misspelling of "Wikipedia" - confirm
    # against the real category title before changing it.
    DatRem.mycatlist.append("Category:Wikpedia Help pages with dated sections")
    DatRem.removed = 0

    # Index-based loop on purpose: findpages() appends to the list being walked.
    listnum = 0
    while listnum < len(DatRem.mycatlist):
        pnt("CAT" + DatRem.mycatlist[listnum])
        findpages(DatRem.mycatlist[listnum])
        listnum += 1
        print("LIST No.  %d" % listnum)
    print(len(DatRem.pagelist))

    #Test System - KILL NEXT TWO LINES
    #DatRem.pagelist=list()
    #DatRem.pagelist.append("User:Ronhjones/Sandbox5")
    if DatRem.pagelist:
        checkpage()
    print(DatRem.removed)
    print(time.ctime())
# Script entry point: run main() with FutureWarning messages suppressed.
if __name__ == "__main__":
    with warnings.catch_warnings():
        warnings.simplefilter(action="ignore", category=FutureWarning)
        main()