#!/usr/bin/python
#
#
# HISTORY.PY -- WIKIPEDIA PAGE HISTORY
# Gdr, 2005-05-12
#
#
# INTRODUCTION
#
# This Python library fetches and parses the edit history of articles
# on the English Wikipedia.
#
# It requires the Python Wikipedia Robot Framework
# (http://sourceforge.net/projects/pywikipediabot/).
#
#
# LICENCE
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.

import calendar
import re
import time
import wikipedia

# Regular expressions matching one edit item in the history list. The
# first matches an edit that carries an oldid; the second matches an
# edit without one (in MediaWiki 1.4 the current revision has no
# oldid). The oldid group (edit1_re only) is followed by the timestamp
# group and the user page title group; the latter is None for
# anonymous editors, whose edits link to Special:Contributions.
edit1_re = re.compile(r'name="oldid" value="(\d+)"'
                      r'.* title="[^\"]*">([^<]*\d[^<]*)</a>'
                      r'.* title="(?:(User:[^\"]+)|Special:Contributions)">')
edit2_re = re.compile(r'.* title="[^\"]*">([^<]*\d[^<]*)</a>'
                      r'.* title="(?:(User:[^\"]+)|Special:Contributions)">')
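
# For reference, the kind of history line these are intended to match
# looks roughly like this (a reconstructed sketch consistent with the
# regexps above, not captured MediaWiki 1.4 output; attribute order
# and surrounding markup varied):
#
#   <li>... <input type="radio" name="oldid" value="12345678" />
#   <a href="/w/index.php?title=Foo&amp;oldid=12345678"
#      title="Foo">09:15, 12 May 2005</a> ...
#   <a href="/wiki/User:Example" title="User:Example">Example</a> ...</li>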

# Month names, abbreviated and in full, as they appear in history
# timestamps.
months = {
    'Jan': 1, 'January': 1,
    'Feb': 2, 'February': 2,
    'Mar': 3, 'March': 3,
    'Apr': 4, 'April': 4,
    'May': 5,
    'Jun': 6, 'June': 6,
    'Jul': 7, 'July': 7,
    'Aug': 8, 'August': 8,
    'Sep': 9, 'September': 9,
    'Oct': 10, 'October': 10,
    'Nov': 11, 'November': 11,
    'Dec': 12, 'December': 12,
}

def dateParse(date):
    """dateParse(date)

    Parse a history timestamp like '09:15, 12 May 2005' and return
    the time as a number of seconds since the epoch."""
    # The current time supplies default values for any missing fields.
    tm = list(time.gmtime()[:5]) + [0]
    # Use a slot-filling approach to guess what each field means.
    fields = re.split(r'(?u)[^\w:]+', date)
    for field in fields:
        if re.match(r'^\d\d\d\d$', field):
            # Four digits is a year.
            tm[0] = int(field)
        elif re.match(r'^\d\d?$', field):
            # One or two digits is a day of the month.
            tm[2] = int(field)
        elif re.match(r'^\d\d:\d\d$', field):
            # Two digits, a colon, two digits is a time of day (HH:MM).
            tm[3] = int(field[0:2])
            tm[4] = int(field[3:5])
        elif field in months:
            # A month name, abbreviated or in full.
            tm[1] = months[field]
    return calendar.timegm(tm)
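
# For example (an illustrative doctest-style sketch; the exact
# timestamp format depends on the wiki's date settings):
#
#   >>> time.gmtime(dateParse('09:15, 12 May 2005'))[:5]
#   (2005, 5, 12, 9, 15)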

def historyParse(edit):
    """historyParse(edit)

    Parse a single edit item from a history page and return it as a
    dictionary (see historyPage for the keys)."""
    # Try the form with an oldid first.
    m = edit1_re.search(edit)
    if m:
        return {
            'oldid': m.group(1),
            'date': dateParse(m.group(2)),
            'user': m.group(3),
        }
    # Fall back to the form without one.
    m = edit2_re.search(edit)
    if m:
        return {
            'date': dateParse(m.group(1)),
            'user': m.group(2),
        }
    raise wikipedia.Error("Can't parse edit:\n" + edit)
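
# An old revision typically parses to something like this (a sketch;
# note that 'user' keeps the 'User:' prefix and is None for anonymous
# editors):
#
#   {'oldid': '12345678', 'date': 1115889300, 'user': 'User:Example'}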

def historyPage(page, limit=None, offset=None):
    """historyPage(page, limit=None, offset=None)

    Get the history of the article given by 'page'. Optional
    arguments: 'limit' specifies the maximum number of edits to
    return, and 'offset' says where in the history to start. Returns
    the history as a list of dictionaries, one per edit, with keys
    'oldid' (the id of the revision following the edit, if known; in
    MediaWiki 1.4 the current revision has no id), 'date' (the time
    of the edit as a number of seconds since the epoch), and 'user'
    (the user who made the edit)."""
    # Wait, if necessary, until enough time has passed since the
    # previous request.
    wikipedia.get_throttle()
    # Which web host are we fetching from?
    host = page.site().hostname()
    # Build the address of the history page on that host.
    address = '/w/index.php?title=%s&action=history' % page.urlname()
    if limit:
        address += '&limit=%d' % limit
    if offset:
        address += '&offset=%d' % offset
    # Fetch the page.
    wikipedia.output(u"Getting history for %s" % page.linkname())
    text, charset = wikipedia.getUrl(host, address)
    # Extract the list of edit items and parse each one.
    m = re.search(r'<ul id="pagehistory"><li>(.*)</li></ul>', text, re.S)
    if not m:
        raise wikipedia.Error("Can't find the list of edits:" + text)
    return map(historyParse, m.group(1).split('</li><li>'))
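
# Typical use (a sketch; how the page object is constructed depends on
# the version of the framework, e.g. wikipedia.PageLink):
#
#   >>> pl = wikipedia.PageLink(wikipedia.getSite(), 'Python')
#   >>> edits = historyPage(pl, limit=3)
#   >>> [e['user'] for e in edits]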

def getOldRevision(page, oldid):
    """getOldRevision(page, oldid)

    Return the text of revision 'oldid' of the article given by
    'page'."""
    wikipedia.get_throttle()
    host = page.site().hostname()
    address = page.site().edit_address(page.urlname()) + '&oldid=%s' % oldid
    text, charset = wikipedia.getUrl(host, address, page.site())
    # The revision text is in the text box of the edit form.
    m = re.search('<textarea[^>]*>(.*)</textarea>', text, re.S)
    if not m:
        raise wikipedia.Error("Can't find the revision text:" + text)
    return unicode(wikipedia.unescape(m.group(1)).rstrip(), charset,
                   errors='replace')
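
# Typical use, together with historyPage (a sketch):
#
#   >>> edits = historyPage(pl, limit=2)
#   >>> # edits[0] is the current revision and may lack an oldid;
#   >>> # edits[1] is the previous one.
#   >>> text = getOldRevision(pl, edits[1]['oldid'])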