# coding: utf-8
import urllib2, time, urllib
import random
# cookielib keeps the session cookies that the MediaWiki login sets.
import cookielib
urlopen = urllib2.urlopen
Request = urllib2.Request
cj = cookielib.LWPCookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
#################################
# Wikipedia functions #
#################################
def parse(page, tag):
    # Scan the API response lines for tag="value" and return the value
    # between the quotes.
    for line in page:
        if tag + '="' in line:
            value = ''
            for letter in line[line.find(tag + '="') + len(tag) + 2:]:
                if letter == '"':
                    return value
                value += letter
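def _example_parse():
    # Hedged usage sketch (helper added for illustration, not called by the bot):
    # parse() walks to token=" and collects characters up to the closing quote.
    sample = ['<login result="NeedToken" token="abc123" cookieprefix="enwiki" />']
    return parse(sample, 'token')  # -> 'abc123'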
def load(name):
    # Fetch the current wikitext of a page through the API (XML output).
    data = urllib.urlencode({'format': 'xml', 'action': 'query', 'prop': 'revisions', 'rvprop': 'content', 'titles': name})
    loadString = 'http://en.wikipedia.org/w/api.php?'
    page = urllib2.urlopen(loadString, data)
    pagestring = page.read()
    # The revision text sits between <rev xml:space="preserve" ...> and </rev>.
    pagestring = pagestring[pagestring.find('xml:space'):]
    return pagestring[pagestring.find('>') + 1:pagestring.find('</rev>')]
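# Hedged usage sketch: load('Urban design') POSTs
#   format=xml&action=query&prop=revisions&rvprop=content&titles=Urban+design
# to api.php and returns the latest revision's wikitext as one string;
# a missing page falls out as '', which run() checks for.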
def login():
    # Step 1: ask for a login token.
    data = urllib.urlencode({'format': 'xml', 'action': 'login', 'lgname': 'PointBot', 'lgpassword': password})
    loginString = 'http://en.wikipedia.org/w/api.php?'
    loginpage = urllib2.urlopen(loginString, data).readlines()
    token = parse(loginpage, 'token')
    cookieprefix = parse(loginpage, 'cookieprefix')
    sessionid = parse(loginpage, 'sessionid')
    # Step 2: log in again, this time with the token.
    data = urllib.urlencode({'enwiki_session': sessionid, 'format': 'xml', 'action': 'login', 'lgname': 'PointBot', 'lgpassword': password, 'lgtoken': token})
    loginpage = urllib2.urlopen(loginString, data).readlines()
    lguserid = parse(loginpage, 'lguserid')
    lgtoken = parse(loginpage, 'lgtoken')
    sessionid = parse(loginpage, 'sessionid')
    print 'Login was: ', parse(loginpage, 'result')
    print lguserid, lgtoken, sessionid
    return lguserid, lgtoken, sessionid
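# The two POSTs above follow the old MediaWiki login handshake: the first
# returns result="NeedToken" plus a token, the second repeats the
# credentials with lgtoken set and should return result="Success"; the
# cookie jar installed at the top keeps the session cookies in between.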
def get_edit_token(name, lguserid, lgtoken, sessionid):
    # Ask the API for an edit token. The cookie jar supplies the session
    # cookies, so no explicit auth headers are needed here.
    data = urllib.urlencode({'format': 'xml', 'action': 'query', 'prop': 'info|revisions', 'intoken': 'edit', 'titles': name})
    loadString = 'http://en.wikipedia.org/w/api.php?'
    req = urllib2.Request(loadString, data)
    page = urllib2.urlopen(req).readlines()
    timestamp = parse(page, 'timestamp')
    edittoken = parse(page, 'edittoken')
    return timestamp, edittoken
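# Hedged usage sketch: the XML reply carries attributes such as
#   timestamp="2011-01-01T00:00:00Z" edittoken="abcdef0123456789+\"
# which parse() pulls out; in this API version the edit token is
# session-wide, so the same token works for any page.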
def edit_full(name, newcontent, timestamp, edittoken, summary):
    # Replace the entire text of a page.
    data = urllib.urlencode({'format': 'xml', 'action': 'edit', 'title': name, 'summary': summary, 'text': newcontent, 'basetimestamp': timestamp, 'token': edittoken})
    loadString = 'http://en.wikipedia.org/w/api.php?'
    urllib2.urlopen(loadString, data)
def edit_add(name, newcontent, timestamp, edittoken, summary):
    # Append a new section to a page instead of overwriting it.
    data = urllib.urlencode({'format': 'xml', 'action': 'edit', 'title': name, 'section': 'new', 'summary': summary, 'text': newcontent, 'basetimestamp': timestamp, 'token': edittoken})
    loadString = 'http://en.wikipedia.org/w/api.php?'
    urllib2.urlopen(loadString, data)
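# Hedged usage sketch:
#   edit_full('User:PointBot/sandbox', 'new text', timestamp, edittoken, 'test')
# basetimestamp lets the API reject the save if someone else edited the
# page after we loaded it (an edit conflict).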
def setup():
    lguserid, lgtoken, sessionid = login()
    timestamp, edittoken = get_edit_token('User:PointBot/log', lguserid, lgtoken, sessionid)
    return lguserid, lgtoken, sessionid, edittoken
#################################
# Analysis functions #
#################################
def findNextLink(page):
    # Grab every [[wikilink]] target in the page text (a string) and return
    # a random one. Useful for randomly surfing Wikipedia.
    links = []
    for i in range(len(page) - 1):
        if page[i] == '[' and page[i + 1] == '[':
            link = ''
            j = i + 1
            while ']' not in link and '|' not in link:
                j += 1
                link += page[j]
            if ':' not in link:  # skip interwiki/namespace links, those would be boring
                links.append(link[:-1])
    return random.choice(links)
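def _example_findNextLink():
    # Hedged usage sketch (helper added for illustration, not called by the bot):
    # 'File:Map.png' is skipped because its target contains ':'.
    sample = 'The [[City|city]] has [[Urban design]] and [[File:Map.png]].'
    return findNextLink(sample)  # -> 'City' or 'Urban design'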
def getFirstSentence(page):
    # Heuristic: walk the wikitext and stop at the first '.' that plausibly
    # ends a sentence, skipping periods inside templates, links, parentheses,
    # HTML entities and italics, plus some dotted abbreviations like 'U.S.'.
    # There's probably a better way to do this.
    score = 0
    italics = 0
    for i in range(len(page)):
        if page[i] in '{[(<' or page[i:i + 4] == '&lt;':
            score -= 1
        if page[i] in '}])>' or page[i:i + 4] == '&gt;':
            score += 1
        if page[i:i + 2] == "''":  # wiki italics/bold markup toggles on quote pairs
            italics = 1 - italics
        if score == 0 and italics == 0:
            if page[i] == '.' and page[i - 2] != ' ' and page[i - 2] != '.':
                if page[i - 3:i - 1] != "''":
                    return page[:i + 1]
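def _example_getFirstSentence():
    # Hedged usage sketch (helper added for illustration, not called by the bot):
    # the '.' after [[France]] ends the sentence; nothing inside the link does.
    sample = "'''Paris''' is the capital of [[France]]. It is big."
    return getFirstSentence(sample)  # -> "'''Paris''' is the capital of [[France]]."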
def verb_in_first_sentence(page):
    # Check whether a descriptive verb appears in the first sentence.
    verbs = ['is', 'are', 'were', 'was', 'will', 'refers']
    first = getFirstSentence(page)
    if first is None:  # getFirstSentence found no sentence at all
        return 0
    found = 0
    for verb in verbs:
        if verb in first:
            found = 1
    return found
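# Hedged note: the check above is a plain substring test, so 'is' also
# matches inside words like 'This'; such false positives only mean an
# article goes unreported, so the bot errs on the quiet side.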
def run(name, names, lguserid, lgtoken, sessionid, edittoken):
    page = load(name)
    print 'Checking: ', name
    if page != '':
        if '{{disambiguation}}' not in page and '{{disambig}}' not in page and page[0] != '#':  # not a disambiguation page or a redirect
            if verb_in_first_sentence(page) == 0:  # no descriptive verb in the first sentence
                timestamp, edittoken = get_edit_token(name, lguserid, lgtoken, sessionid)
                oldpage = load('User:PointBot/log')
                first = getFirstSentence(page)
                if name not in oldpage:  # do not report the same article twice
                    edit_full('User:PointBot/log', oldpage + '\n\nArticle [[' + name + ']] lacks a proper descriptive introduction and could use some editing.\n' + str(time.time()), timestamp, edittoken, 'Verb report')
                    print 'Article ' + name + ' lacks proper descriptive introduction', first
            try:
                nextname = findNextLink(page)
            except:
                nextname = random.choice(names)
        else:
            nextname = random.choice(names)  # page was a disambiguation or redirect
    else:
        nextname = random.choice(names)  # page was blank
    return nextname
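# Hedged usage sketch (requires a logged-in session):
#   nextname = run('Urban design', ['Urban design'], lguserid, lgtoken, sessionid, edittoken)
# checks one article, logs it to User:PointBot/log if its first sentence
# lacks a descriptive verb, and hands back the next title to visit.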
if __name__ == '__main__':
    password = '*********'
    # good example: urban design
    lguserid, lgtoken, sessionid, edittoken = setup()
    names = ['wiki']
    name = run('wiki', names, lguserid, lgtoken, sessionid, edittoken)
    while True:
        try:
            names.append(name)
            name = run(name, names, lguserid, lgtoken, sessionid, edittoken)
        except:
            name = random.choice(names)