This is the code I used to identify Wikipedia articles that have only ever seen one human editor (usually the page creator). The last time I ran this was two years ago; it produced a list about 2000 entries long, which has since been whittled down to about 100 — in other words, all but one hundred have been reviewed. I'll probably run this script again soon, accounting for the articles already reviewed in the first run. When I do, I'll clean these scripts up, re-organize them, give them more meaningful filenames, etc.
xmlsplitter.py
#XMLsplitter.py
#V03
#Released under GNU GPLv3 by Monk of the Highest Order, 2008.
#Partitions a giant XML document
#into smaller documents without breaking content across
#a selected element. So for example, if the element is
#<artist> all data between that and the </artist> tag is kept in the
#same doc.
import re, random
from utility import *
from sys import exit
#example exml doc:
#<base>
# <mid1>
# <mid2>
# <pageunit>
# change10
# change 9
# </pageunit>
# ...repeat x100000000000....
# </mid2>
# </mid1>
#<base>
#basic idea: Strip base, mid1, mid2 (why even worry)
# just make files which just contain distinct page data
# I wouldn't make 1 file for each page. Not sure the
# file system could handle 2mil files. I'd stay safe
# at something like 2k. Named numerically, probably,
# so we don't need to get into title extraction
# so 2.2 mil / 2000 files = 1.1*10^3 pages ea.
def interpret(textline, pagecount, parent_tags, data_to_get):
    """Classify one line of the XML dump.

    Returns (line_to_keep, pagecount).  pagecount is incremented whenever
    the closing tag of *data_to_get* (e.g. </page>) is seen; a line that is
    nothing but an opening/closing *parent_tags* wrapper (e.g. <mediawiki>)
    is dropped by returning None in place of the line.  All other lines are
    passed through unchanged.
    """
    # Raw strings: the original non-raw '\s' escapes are deprecated escape
    # sequences in Python 3.
    if re.search(r'(?i)</' + data_to_get + r'>', textline):
        pagecount += 1
    elif re.search(r'(?i)^\s*</?(' + parent_tags + r')>\s*$', textline):
        return None, pagecount
    return textline, pagecount
def get_pages_per_file(rel_position):
    """Map a relative position in the big source file to a page-per-output-file
    count.

    rel_position is a float where 0.0 == the beginning of the file and
    1.0 == the end.  Buckets grow toward the end of the file (fewer pages
    per output file near the start).
    """
    if rel_position < 0.1:
        pages_per_file = 200
    elif rel_position < 0.3:
        pages_per_file = 800
    elif rel_position < 0.5:
        pages_per_file = 1200
    elif rel_position <= 1.0:
        # BUGFIX: this was '< 1.0', so resuming at exactly end-of-file
        # (rel_position == 1.0, which main() can produce) aborted the run.
        pages_per_file = 2600
    else:
        print("error! no rel position within 0-1.0", repr(rel_position))
        exit()
    return pages_per_file
def main():
    """Split the giant XML dump into numbered '.block' files of whole pages.

    Resumable: progress (source path, byte offset, next file number) is
    pickled to 'xmlsplitter.tmp' after every output file, via utility's
    pickle helpers.
    """
    (sourcexml, pos, filenum) = unpickle_data('xmlsplitter.tmp',
                                              ['1008smh.xml', 0, 1])
    nfilegoal = 16000.  # estimate of number of output files desired
    output_folder = 'output/'
    data_to_get = 'page'       # element whose content must not be split
    parent_tags = 'mediawiki'  # wrapper element(s) to strip entirely
    fbig = open(sourcexml, 'r')
    fbig.seek(0, 2)
    eof_loc = fbig.tell()
    fbig.seek(pos)
    pages_per_file = get_pages_per_file(pos / float(eof_loc))
    while fbig.tell() < eof_loc:
        if filenum >= nfilegoal:
            fbig.close()
            exit()
        newblock = []
        pagecount = 0
        nxt = fbig.tell()  # renamed: 'next' shadowed the builtin
        while pagecount < pages_per_file and nxt < eof_loc:
            prev = nxt
            # BUGFIX: newline was unbound when readline() raised IOError,
            # crashing at the 'if newline' test below.
            newline = None
            try:
                newline, pagecount = interpret(fbig.readline(),
                                               pagecount, parent_tags,
                                               data_to_get)
            except IOError:
                print("IOError... waiting it out.")
                fbig.seek(prev + 30)  # skip forward a little and retry
            nxt = fbig.tell()
            if nxt > eof_loc:
                nxt = prev + 30
                fbig.seek(nxt)
            if newline:
                newblock.append(newline)
        # Wrap the collected pages in a synthetic root so each output file
        # is one well-formed XML document.
        newblock.insert(0, '<block>\n')
        newblock.append('</block>')
        print(fbig.tell(), eof_loc)
        # 'with' guarantees the output file is closed even on error
        # (the original leaked the handle if writelines raised).
        with open(output_folder + str(filenum) + '.block', 'w') as fblock:
            fblock.writelines(newblock)
        rel_position = fbig.tell() / float(eof_loc)
        pages_per_file = get_pages_per_file(rel_position)
        print("File " + str(filenum) + " (" +
              str(int(rel_position * 100)) + "%) written.")
        filenum += 1
        # Checkpoint so an interrupted run can resume here.
        pickle_data('xmlsplitter.tmp', [sourcexml, fbig.tell(), filenum])
    fbig.close()
# Script entry point; main() resumes from 'xmlsplitter.tmp' if a previous
# run was interrupted.
if __name__ == '__main__':
main()
parser.py
#The structure of this program is designed not around speed, but around
#memory constraints. It is assumed that you have lotsa space and lotsa time.
#TODO:
#Output file
#Cleanup constants -> (eg, one file should handle the constant locations of
# the bot list, the redirect list, ids-editors db, one-editor folder, etc.
# probably this folder)
import sys
import re
import csv
import optparse
from xml.sax import make_parser, handler
import sqlite3
from glob import glob
try: from urllib.parse import quote
except: from urllib import quote
import utility
import pageparser_db
import wiki_pageset
import one_authorize
from xml_to_pageset import WikiXMLParser
BOT_NAMES_LIST = 'bot_list.txt'
BOT_IDS_LIST = 'bot_list_ids.txt'
def get_bots_list(value='names'):
    """Load the known-bot list by 'names' or by 'ids'.

    Each line of the file is stripped and URL-quoted; the hard-coded
    'Conversion script' pseudo-user is appended.  Returns the sorted list.
    Exits the process if the file cannot be read.
    (Any value other than 'names'/'ids' leaves fbots unbound — callers only
    pass these two.)
    """
    try:
        if value == 'names':
            fbots = open(BOT_NAMES_LIST, 'r')
        elif value == 'ids':
            fbots = open(BOT_IDS_LIST, 'r')
        lines = fbots.readlines()
        fbots.close()
        # BUGFIX: the original loop ran over range(len(bots)-1) and so never
        # stripped/quoted the final entry of the file.
        bots = [quote(line.rstrip()) for line in lines]
        bots.append('Conversion%20script')
        return sorted(bots)
    except IOError:
        print(" error: could not read one of bots list filez")
        sys.exit()
##### Command System #####
if __name__=='__main__':
# Command-line front end: exactly one operation flag (-1/-f/-2/-t/-3/-4/-5/-i)
# plus optional -v/-q verbosity; the remaining arguments are the input files.
command = optparse.OptionParser()
command.set_usage("""
Usage: parser.py [-v/-q]
[-1 1.xml 2.xml 3.xml...]
[-f 1.xml.csv 2.xml.csv...]
[-2 1.xml.csv 2.xml.csv...]
[-3 1.xml.csv 2.xml.csv...]
[-4 1.xml.csv 2.xml.csv...]
[-5 1.xml.inx.csv 2.xml.inx.csv...]
""")
command.add_option("-1", "--xml_decode",
action="store_true",
dest="xml_decode",
help="XML -> CSV 'pageset' of pagename, pageid, editorid, and edits by editor id")
command.add_option("-f", "--filter_csv",
action="store_true",
dest="filter_csv",
help="refilter a csv file for bots, userpages, etc...")
command.add_option("-2", "--fill-editor-db",
action="store_true",
dest="fill_editor_db",
help="add CSV pageset data to: sqlite db of edit count per page by each user.")
command.add_option("-t", "--tally-editor-db",
action="store_true",
dest="tally",
help="run (2) on every pageset available, then run this, before using option (4)")
command.add_option("-3", "--one-editor",
action="store_true",
dest="one_editor",
help="CSV pageset -> new CSV with one-editor pages only")
command.add_option("-4", "--inexp-editor",
action="store_true",
dest="inexp_editor",
help="""CSV pageset -> new CSV with with one author only,
with that author having less than 15 edits to his name
(completely fill the SQLITE database b4 using this option).""")
command.add_option("-5", "--title-list",
action="store_true",
dest="title_list",
help="CSV pageset -> list of pages within by title")
command.add_option("-i", "--id-list",
action="store_true",
dest="id_list",
help="CSV pageset -> list of pages within by id")
command.add_option("--gt_ids",
action="store",
dest="gt_ids",
help="necessary for -4: list of the userids whose editcounts qualify them as experienced")
command.add_option("--gt_ips",
action="store",
dest="gt_ips",
help="necessary for -4: list of the ips whose editcounts qualify them as experienced")
#command.add_option("-X", "--mult-editors",
# action="store_true",
# dest="make_list",
# help="CSV pageset -> new CSV of pages with more than one editor.")
command.add_option("-v", "--verbose",
action="store_true",
dest="output_verbose",
help="option: give lots of debug output")
command.add_option("-q", "--quiet",
action="store_true",
dest="output_quiet",
help="option: No command line output")
(options, args) = command.parse_args(sys.argv[1:])
# verbose: 0 = silent, 1 = normal, 2 = debug.  Several imported modules
# also take this value as a constructor argument.
if options.output_quiet:
verbose=0
elif options.output_verbose:
verbose=2
else:
verbose=1
#testing for usability of command line options...
# Reject invocations that select more or fewer than one operation flag.
operations=options.__dict__
j=0
for i in operations:
if i not in ['output_quiet', 'output_verbose', 'gt_ids', 'gt_ips'] and \
operations[i]:
if verbose: print(str(i))
j+=1
if j==2:
print(str(i))
command.print_usage()
sys.exit()
if j==0:
command.print_usage()
sys.exit()
if True:
#if we're using an option which only uses file(s) as the argument(s)
if not args:
print(' error:this operation requires at least one file argument')
sys.exit()
elif [] in [glob(x) for x in args]:
print(' error:this operation requires all arguments to be files.')
sys.exit()
args=utility.glob_list(args)
###### operations ######
if options.xml_decode:
parser = make_parser()
parser.setContentHandler(WikiXMLParser(verbose=verbose))
cleaner = wiki_pageset.PageFilter(verbose=verbose,
bot_ids=get_bots_list('ids'), bot_names=get_bots_list('names'))
for arg in args:
if verbose: print(" opening file",arg)
parser.parse(arg)
pages=cleaner.clean(parser.getContentHandler().pages,
rm_bot_revisions=True,
rm_user_talk=True,
rm_redirects=True,
associate_to=False,
associate_from=True,
rm_usernames=True)
if verbose: print(" done.")
# NOTE(review): csv_store_pageset is unqualified here but defined in
# wiki_pageset — looks like a NameError when -1 is used; confirm it
# should be wiki_pageset.csv_store_pageset.
csv_store_pageset(arg+'.csv', pages)
elif options.filter_csv:
cleaner = wiki_pageset.PageFilter(verbose=verbose,
bot_ids=get_bots_list('ids'), bot_names=get_bots_list('names'))
for arg in args:
if verbose: print(" opening file",arg)
pageset=wiki_pageset.csv_load_pageset(arg)
pageset=cleaner.clean(pageset,
rm_bot_revisions=False,
rm_user_talk=True,
rm_redirects=False,
associate_to=False,
associate_from=False,
rm_usernames=False)
if verbose: print(" done.")
wiki_pageset.csv_store_pageset(arg[:-4] + '.f.csv', pageset)
elif options.fill_editor_db:
# Tallies per-user edit counts out of each pageset into two CSVs
# (numeric user ids and ip addresses handled separately).
editor_db = one_authorize.EditsByUser(verbose=verbose)
for arg in args:
if verbose: print(" opening file",arg)
pageset=wiki_pageset.csv_load_pageset(arg)
userids, ip_addrs=editor_db.get_edits_by_user(pageset)
utility.csv_write(arg[:-4]+'.editors_ids.csv', userids)
utility.csv_write(arg[:-4]+'.editors_ips.csv', ip_addrs)
elif options.one_editor:
# Keep only pages whose whole revision history has a single contributor.
for arg in args:
if verbose: print(" opening file",arg)
pageset=wiki_pageset.csv_load_pageset(arg)
pageset2=[]
for page in pageset:
editors=set()
if verbose==2: print(" going thru pageset")
for revision in page.revisions:
editors.add(revision["contributorID"])
if len(editors)>1:
break
else:
pageset2.append(page)
wiki_pageset.csv_store_pageset(arg[:-4]+'.one_edtr', pageset2)
if verbose: print(" done")
elif options.inexp_editor:
for arg in args:
if verbose: print("opening file", arg)
pageset_listform=utility.csv_read(arg)
if not options.gt_ips or not options.gt_ids:
print("""ERROR. you need to provide a list of
'experienced users' for this operation... both
by ip and userid. see --help""")
sys.exit()
editor_db = one_authorize.EditsByUser(verbose=verbose)
pageset2 =editor_db.get_inx_pages(pageset_listform,
ips_gt=options.gt_ips,
ids_gt=options.gt_ids)
utility.csv_write(arg[:-4]+'.inx_edtr', pageset2)
if verbose: print(" done")
elif options.id_list or options.title_list:
# NOTE(review): options.idlist looks like a typo for options.id_list
# (the dest declared above) — as written this raises AttributeError.
if options.idlist:
ext='.pageids'
columnpos=1
else:
ext='.titles'
columnpos=0
for arg in args:
if verbose: print(" opening file",arg)
f_arg=open(arg,'r')
f_output=open(arg+ext,'w')
f_arg.seek(0,2)
eof_loc=f_arg.tell()
f_arg.seek(0)
# Stream the CSV in 800-line chunks to bound memory use.
while f_arg.tell() < eof_loc:
line_buffer=[]
for i in range(800):
line_buffer.append(f_arg.readline())
# NOTE(review): remove('') drops only the FIRST empty string; reading
# past EOF can append several '' entries — confirm this is sufficient.
line_buffer.remove('') #in case we exceed the end of the file
if verbose: print(" progress:", float(100*f_arg.tell())/eof_loc)
splitted=wiki_pageset.csv_load_pageset(line_buffer, isfile=False)
page_attr_list=[x[columnpos] + '\n' for x in splitted]
f_output.writelines(page_attr_list)
f_output.flush()
del splitted
del page_attr_list
# NOTE(review): f_titles is never defined — presumably this should be
# f_output.close() (and f_arg.close() is missing as well).
f_titles.close()
if verbose: print(" done.",arg)
elif options.tally:
editor_db = one_authorize.EditsByUser(verbose=verbose)
if verbose: print(" start")
editor_db.fill_edit_db(input_files=args, editcount_folder='/opt/editcounts/')
if verbose: print(" done")
pageparser_db.py
# Much of this is obsolete and no longer used... sqlite is rather no good for some high-load things, I feel. :* Just kidding — I'm just no good at sqlite optimization.
import sqlite3,sys
#5555555555555555
# DB operations 5
#5555555555555555
# Each descriptor dict below defines one sqlite database: the file to open,
# the CREATE TABLE statement used to bootstrap it, and the table names
# expected if the file already exists (see connect_base below).
ID_TO_NAME = {}
ID_TO_NAME['filename']='ids_to_names.sqlite'
ID_TO_NAME['creation_schema']="CREATE TABLE contributors(contributorID text PRIMARY KEY,username text)" #USERIDS MUST NOT BE STRONGLY TYPED AS INTS: SEVERAL OF THE EARLIER IDS WERE IN ASCII, AND ARE NOT CONVERTIBLE TO INTS.
ID_TO_NAME['table_list']=['contributors']
EDITCOUNT = {}
EDITCOUNT['filename']='editcount.sqlite'
# NOTE(review): this schema declares contributorID INTEGER even though the
# comment below says user ids must NOT be strongly typed as ints — confirm
# which is intended.
EDITCOUNT['creation_schema']="CREATE TABLE total_edits(contributorID INTEGER PRIMARY KEY,editcount INTEGER)" #USERIDS MUST NOT BE STRONGLY TYPED AS INTS (in sqlite, int is the only type which can be strongly typed, and that is by using the term INTEGER): SEVERAL OF THE EARLIER IDS WERE IN ASCII, AND ARE NOT CONVERTIBLE TO INTS.
EDITCOUNT['table_list']=['total_edits']
REDIRECTS = {}
REDIRECTS['filename']='redirects.sqlite'
REDIRECTS['creation_schema']="CREATE TABLE redirects(idnum INTEGER NOT NULL UNIQUE)"
REDIRECTS['table_list']=['redirects']
def connect_base(filename, creation_schema, table_list):
    """Open an sqlite database, bootstrapping its schema on first use.

    If the file has no tables at all, creation_schema is executed; if it
    has tables but not table_list[0], the schema is considered foreign and
    the process exits.  Returns (connection, cursor).
    """
    db = sqlite3.connect(filename)
    cursor = db.cursor()
    cursor.execute("select tbl_name from sqlite_master where type='table' order by tbl_name")
    existing = []
    for record in cursor.fetchall():
        existing.extend(record)
    if not existing:
        cursor.execute(creation_schema)
        db.commit()
    elif table_list[0] not in existing:
        print(filename, " db has unknown schema. please fix manually.")
        sys.exit()
    return db, cursor
# Convenience wrappers: each opens (creating on first use) one of the three
# databases described by the module-level descriptor dicts above.
def connect_contributor_id_base():
return connect_base(ID_TO_NAME['filename'],
ID_TO_NAME['creation_schema'], ID_TO_NAME['table_list'])
def connect_editcount_base(basemodulo):
# Editcount data is sharded into many database files under /opt/editcounts/,
# one per basemodulo value.
return connect_base('/opt/editcounts/'+str(basemodulo)+EDITCOUNT['filename'],
EDITCOUNT['creation_schema'], EDITCOUNT['table_list'])
def connect_redirect_base():
return connect_base(REDIRECTS['filename'],
REDIRECTS['creation_schema'], REDIRECTS['table_list'])
# <<<<<<<<<<<<>>>>>>>>>>>>
# < ID_to_Name functions >
# <<<<<<<<<<<<>>>>>>>>>>>>
def associate(contributorID, username):
    """Record a (contributorID, username) pair in the id->name database.

    Returns None if the id is already present (IntegrityError on the
    primary key); otherwise commits and returns the cursor's execute result,
    matching the original interface.
    """
    base, cu = connect_contributor_id_base()
    try:
        results = cu.execute('INSERT INTO contributors(contributorID,username) values (?,?)', (contributorID, username))
        base.commit()
        return results
    except sqlite3.IntegrityError:
        # BUGFIX: the original returned here without closing the connection,
        # leaking it on every duplicate insert.
        return None
    finally:
        base.close()
def get_username(contributorID):
    """Return every username recorded for *contributorID*, as a flat list."""
    base, cu = connect_contributor_id_base()
    cu.execute('SELECT username FROM contributors WHERE contributorID=?',(contributorID,))
    found = []
    for record in cu.fetchall():
        found.extend(record)
    base.close()
    return found
wiki_pageset.py
# For understanding and filtering sets of page history for bots, redirects, etc. parser.py is usually used to load and call the classes and functions in here.
import utility, pageparser_db, sqlite3
try: from urllib.parse import quote
except: from urllib import quote
from time import time #for benchmarking purposes
class PageHistory():
    """Container for one wiki page: its title, numeric id, and the list of
    revision dicts attached by the XML parser / CSV loader."""
    def __init__(self):
        # All fields are populated after construction by the caller.
        self.title, self.idnum = None, None
        self.revisions = []
def csv_store_pageset(filename, cleaned_pageset):
    '''a pageset is a list [] of PageHistory objects'''
    # WARNING: only contributor ids survive — usernames/comments are dropped.
    # Each output row is: title, idnum, contributorID, contributorID, ...
    rows = []
    for page in cleaned_pageset:
        contributor_ids = [revision['contributorID'] for revision in page.revisions]
        rows.append(utility.flatten_list([page.title, page.idnum, contributor_ids]))
    if filename.split('.')[-1] != 'csv':
        filename += '.csv'
    utility.csv_write(filename, rows)
    return True
def csv_load_pageset(filename, isfile=True):
    """Rebuild PageHistory objects from CSV rows of
    [title, idnum, contributorID, contributorID, ...] — the inverse of
    csv_store_pageset (usernames/comments come back empty)."""
    pageset = []
    for row in utility.csv_read(filename, isfile):
        history = PageHistory()
        history.title = row[0]
        history.idnum = row[1]
        history.revisions = [{'contributorID': cid, 'username': '', 'comment': ''}
                             for cid in row[2:]]
        pageset.append(history)
    return pageset
######################
# Massive pageset filterer
######################
# NOTE(review): this module imports only utility, pageparser_db and sqlite3,
# but clean() calls re.search below — 'import re' appears to be missing.
# NOTE(review): clean() repeatedly tests a bare name 'verbose' rather than
# self.verbose; that only resolves if a global 'verbose' exists (parser.py
# creates one when run as a script) — NameError when imported elsewhere.
class PageFilter():
# Filters a pageset in bulk: redirects, talk/user pages, bot revisions,
# and optionally syncs contributor-id/username pairs with the sqlite db.
# NOTE(review): mutable default args (bot_names=[], bot_ids=[]) are shared
# across calls; harmless here since they are only read.
def __init__(self, verbose=0,bot_names=[],bot_ids=[]):
self.verbose=verbose
if self.verbose: print(" loading data to clean pagesets")
#redirect stuff....
#int version (by pageid, but those don't always work, trust me...
"""redirect_list=[int(x) for x in redirect_list]
dictum={}
for i in range(100):
dictum[i]=[]
for item in redirect_list:
dictum[item % 100].append(item)"""
#str version
#f_r_list=open('TLR4')
#redirect_list=f_r_list.readlines()
#dictum={}
#for item in redirect_list:
# if item[:2] not in dictum:
# dictum[item[:2]]=[]
# dictum[item[:2]].append(item.rstrip())
#PageFilter.redirect_complex=dictum
#del redirect_list
#f_r_list.close()
# NOTE(review): these assignments store per-instance configuration on the
# CLASS, so every instance shares the last constructor's values.
PageFilter.bot_ids=bot_ids
PageFilter.bot_names=bot_names
if self.verbose==2: print(" Connecting to sqlite database of userid-username pairs.")
#sqlite database with a single table with userid as primary key and username as the other value
PageFilter.id_base, PageFilter.id_cu= \
pageparser_db.connect_contributor_id_base()
# clean(): one pass over the pageset; collects indices of unwanted pages
# and revisions, then deletes them in reverse index order at the end.
def clean(self, pageset, rm_bot_revisions=True, rm_user_talk=True,
rm_redirects=True, associate_to=False, associate_from=False,
rm_usernames=True):
if verbose==2:timer={"redirects":0,"user_talk":0,
"associate to/from":0, "revisions":0, "bot_revisions":0,
"bots2":0, "rm_usernames":0, "rm_unnec_revisions":0,
"rm_unnec_pages":0,"commit":0}
if verbose==2: eop=len(pageset)
if verbose==2: prev='0'
unnec_pages = []
if associate_from: PageFilter.id_cu.execute('BEGIN;')
for pagenum in range(len(pageset)):
if verbose==2: tmptime=time()
if verbose==2: cur=str(int((pagenum/float(eop))*100))
if verbose==2:
if self.verbose and len(cur)>1 and cur[0] != prev[0]: print(cur)
if verbose==2: prev=cur
# NOTE(review): PageFilter.redirect_complex is never assigned (its
# construction is commented out in __init__), so rm_redirects=True
# raises AttributeError here — confirm before enabling.
if rm_redirects:
title=pageset[pagenum].title
#idnum=int(pageset[pagenum].idnum)
#if idnum in PageFilter.redirect_complex[idnum%100]:
if title[:2] in PageFilter.redirect_complex and \
title in PageFilter.redirect_complex[title[:2]]:
if verbose==2: timer['rm_unnec_pages']+=1
if verbose==3: print('found redirect', title)
unnec_pages.append(pagenum)
if verbose==2: timer['redirects']+=(time()-tmptime)
if verbose==2: tmptime=time()
# Drop all talk/user/project-namespace pages (titles are URL-quoted,
# hence the \%20 / \%3A alternatives).
if rm_user_talk:
if re.search('(?i)^(talk|help((\s|\%20)talk)?|wikipedia((\s|\%20)talk)?|user((\s|\%20)talk)?|image((\s|\%20)talk)?|file((\s|\%20)talk)?|category((\s|\%20)talk)?|template((\s|\%20)talk)?|portal((\s|\%20)talk)?)(:|\%3A)',
pageset[pagenum].title):
unnec_pages.append(pagenum)
continue
if verbose==2: timer['user_talk']+=(time()-tmptime)
unnec_revisions=[]
for revision_num in range(len(pageset[pagenum].revisions)):
revision=pageset[pagenum].revisions[revision_num]
if verbose==2: tmptime=time()
# associate_to: pull usernames out of the db into the pageset;
# associate_from: push pageset usernames into the db.
if associate_to:
PageFilter.id_cu.execute('SELECT username FROM contributors WHERE contributorID=?',(revision['contributorID'],))
name=PageFilter.id_cu.fetchone()
if name:
pageset[pagenum].revisions[revision_num]['username'] = name[0]
elif associate_from and revision['username']: #associate from pageset into base
try:
PageFilter.id_cu.execute('INSERT INTO ' + \
'contributors(contributorID,username)' + \
'values (?,?)', (revision['contributorID'],
str(revision['username'])))
except sqlite3.IntegrityError:
pass
if verbose==2: timer['associate to/from']+=(time()-tmptime)
# Bot test: exact membership in the configured lists, then a fuzzy
# check on the username suffix and edit comment.
if rm_bot_revisions:
if verbose==2:tmptime=time()
if revision['username'] in PageFilter.bot_names or \
revision['contributorID'] in PageFilter.bot_ids:
unnec_revisions.append(revision_num)
if verbose==2: timer['bot_revisions']+=1
elif 'bot' in revision['username'][-4:].lower() or \
'bot' in revision['comment'].lower():
#print("possible bot detection - ", revision['username'],
#"not on list...")
unnec_revisions.append(revision_num)
if verbose==2: timer['bot_revisions']+=1
if verbose==2: timer['revisions']+=1
if verbose==2: timer['bots2']+=(time()-tmptime)
if verbose==2: tmptime=time()
if rm_usernames:
pageset[pagenum].revisions[revision_num] = {'contributorID':revision['contributorID']} #this must occur AFTER botcheck.
if verbose==2: timer['rm_usernames']+=(time()-tmptime)
unnec_revisions.reverse() #items must be removed in reverse order
#or a removal will shift the index numbers of all later list items
for entry_num in unnec_revisions:
del pageset[pagenum].revisions[entry_num]
#tmptime=time()
if verbose==2: timer['commit']=len(pageset)
unnec_pages.reverse() #items must be removed in reverse order
for entry_num in unnec_pages:
del pageset[entry_num]
#timer['rm_unnec_pages']+=(time()-tmptime)
#tmptime=time()
if verbose==2: print(" committing id base")
PageFilter.id_base.commit()
#timer['commit']+=(time()-tmptime)
if verbose==2: print(" done cleaning.")
if verbose==2:
for i in timer:
print(" ", i, " | ", str(timer[i])[:5])
return pageset
def only_one_contributor(pageset):
    """Return only the pages whose revisions all share a single contributor."""
    keep = []
    for history in pageset:
        contributors = {rev['contributorID'] for rev in history.revisions}
        if len(contributors) == 1:
            keep.append(history)
    return keep
xml_to_pageset.py
# The core function of making use of all that XML. parser.py is usually used to load and call the classes and functions in here.
from xml.sax import make_parser, handler
try: from urllib.parse import quote
except: from urllib import quote
import wiki_pageset
class WikiXMLParser(handler.ContentHandler):
    """Converts the XML data into a form that can be more
    easily handled en masse by python.  While it is doing
    this, it strips the data of everything but page titles,
    page ids, and a list of revisions for each page.  The
    list of revisions includes only the contributor and the
    comment (both the comment and the contributor name, as
    well as ID or IP, are kept so bots can be filtered out),
    and does not even include dates."""
    # State machine: (child tag, current state) -> new state entered while
    # inside that tag.
    important_tags = {
        ('contributor', 'revision'): 'contributor',
        ('username', 'contributor'): 'username',
        ('comment', 'revision'): 'comment',
        ('revision', 'page'): 'revision',
        ('id', 'page'): 'pageID',
        ('id', 'contributor'): 'contributorID',
        ('ip', 'contributor'): 'contributorID',
        ('title', 'page'): 'pagetitle'
    }
    # Inverse mapping: used to pop back to the enclosing state on endElement.
    important_tags_reverse = {}
    for tag in important_tags:
        important_tags_reverse[(tag[0], important_tags[tag])] = tag[1]

    def __init__(self, verbose=0):
        self.verbose = verbose

    def set_filename(self, filename):
        self.filename = filename

    def startDocument(self):
        # Reset per-document state so one parser instance can parse many files.
        self._elems = 0
        self._attrs = 0
        self.pages = []
        self.parent = 'page'
        self.current = None
        if self.verbose:
            print(' reading XML...')

    def startElement(self, name, attrs):
        self._elems = self._elems + 1
        #self._attrs = self._attrs + len(attrs)
        if name == 'page':
            self.current = wiki_pageset.PageHistory()
            self.parent = 'page'
        elif name == 'revision':
            self.current.revisions.append({'contributorID': '', 'username': '', 'comment': ''})
            self.parent = 'revision'
        # BUGFIX: these lookups referenced FancyCounter — evidently the
        # class's old name — which raised NameError at runtime.
        elif (name, self.parent) in WikiXMLParser.important_tags:
            self.parent = WikiXMLParser.important_tags[(name, self.parent)]

    def endElement(self, name):
        if name == 'page':
            self.pages.append(self.current)
            del self.current
        # BUGFIX: same FancyCounter -> WikiXMLParser rename as above.
        elif (name, self.parent) in WikiXMLParser.important_tags_reverse:
            self.parent = WikiXMLParser.important_tags_reverse[(name, self.parent)]

    def characters(self, content):
        # Text content is routed by the current state; titles, names and
        # comments are URL-quoted so they are safe in the CSV stage.
        if self.parent == 'pagetitle':
            self.current.title = quote(content)
        elif self.parent == 'pageID':
            self.current.idnum = content
        elif self.parent in ['contributorID', 'username', 'comment']:
            self.current.revisions[-1][self.parent] = quote(content)

    def endDocument(self):
        if self.verbose:
            print(" cool stats: ", self._elems, "elements.")
        #if self.verbose: print(" There were", self._attrs, "attributes.")
        return self.pages
one_authorize.py
# All-in-one for creating a tally of how many edits each author has made (on the assumption of a complete and non-redundant set of CSV pagesets) and for removing pages from a pageset based on user editcounts. parser.py is usually used to load and call the classes and functions in here.
from wiki_pageset import PageHistory
from math import ceil, floor
from time import time
import re,operator,os,sys
import utility
class EditsByUser():
# Tallies per-user edit counts across CSV pagesets (sharded into 100 "block"
# files on disk by id/ip range) and filters pagelists down to pages whose
# author is "inexperienced" (not present in the supplied gt lists).
def __init__(self, verbose=0):
self.verbose=verbose
#if self.verbose==2: print(" Connecting to sqlite database of userid edit tables.")
#sqlite database with a single table with userid as primary key and username as the other value
#PageFilter.edit_bases={}
#PageFilter.edit_cursors={}
#for i in range(1000):
# PageFilter.edit_bases[i], PageFilter.edit_cursors[i]= \
# pageparser_db.connect_editcount_base(i)
self.interval_dicts_done=0
self.ip_regex=re.compile('^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$')
# Pack the four captured dotted-quad components into one base-10 integer.
def ip_to_int(self, valuelist):
return int(valuelist[0])*16777216+\
int(valuelist[1])*65536+\
int(valuelist[2])*256+\
int(valuelist[3])
# NOTE(review): under Python 3, '/' is true division, so each component
# here is a float and the result looks like '16.0.0.1' — these should be
# '//' for the inverse of ip_to_int to round-trip.
def int_to_ipstr(self, number):
ip= [((number%(256**4))/256**3),
((number%(256**3))/256**2),
((number%(256**2))/256**1),
((number%(256**1))/256**0)]
return '.'.join([str(x) for x in ip])
# Count edits per contributor in *pageset*; ips and numeric ids are tallied
# separately.  Returns ([ [id, count], ...], [ [ip_as_int, count], ...]),
# each sorted by user.  Contributor ids that are neither dotted-quad ips
# nor pure digits are silently skipped.
def get_edits_by_user(self, pageset):
#get data of pageset
if self.verbose: print(" organizing editor data for storage")
ip_list={}
id_list={}
for page in pageset:
for revision in page.revisions:
userid=revision['contributorID']
is_ip=re.findall(self.ip_regex,userid)
if is_ip:
userid=self.ip_to_int(is_ip[0])
if userid not in ip_list:
ip_list[userid]=1
else: ip_list[userid]+=1
elif re.search('^\d+$', userid):
userid=int(userid)
if userid not in id_list:
id_list[userid]=1
else: id_list[userid]+=1
id_list=[[x,id_list[x]] for x in sorted(id_list)]
ip_list=[[x,ip_list[x]] for x in sorted(ip_list)]
return id_list,ip_list
# Lazily build the descriptors that divide the id and ip spaces into 100
# equal "blocks", each tallied into its own file on disk.
def interval_dicts(self):
self.id_dict = {
'upper':10000000, #in reality, currently users peak at 8mili, but at least for the next year and a half or so it'll stay under ten mill, prob.
'lower':1,
'ext':'ids'
}
self.ip_dict = {
'upper':4294967296,
'lower':16777216,
'ext':'ips'
}
self.user_dicts={'ip':self.ip_dict,'id':self.id_dict}
for user_dict in [self.ip_dict, self.id_dict]:
user_dict['interval']=ceil(float(user_dict['upper']-user_dict['lower'])/100)
user_dict['user_blocks']=[]
user_dict['input_files']={} #although it's not inconceivable that base-10 IPs and IDs could be stored in harmony in the same file, I suspect that, barring some kind of apocalyptic kinda thing, or peak oil, the number of editors will double in the next decade, resulting in the inevitable collision. While adjusting the upper limits of users is a predictable problem, this is something which would be hard to figure out. yah like this script is going 4 ten years.
for i in range(100):
user_dict['user_blocks'].append(i*user_dict['interval'])
# Merge per-pageset editcount CSVs into the 100 per-range block files under
# editcount_folder.  Input files must be sorted by user; each file's dict
# entry remembers the byte offset already consumed, so every block pass
# resumes each input where the previous block left off.
def fill_edit_db(self, input_files=[], editcount_folder='/opt/editcounts/'):
if self.interval_dicts_done==0:
self.interval_dicts()
#input files: a list of valid file addresses, each of which either contains a list of base-10 IPs or wikipedia editor IDs with a number of edits next to it.
#editcount_folder - the folder to put the total counts.
if self.verbose: print(' Categorizing input editcount files')
for filename in input_files: #all files are assumed to exist at this point, and be a
boxed=False
for user_dict in [self.ip_dict, self.id_dict]:
if user_dict['ext'] in filename:
boxed=True
user_dict['input_files'][filename]=0
if not boxed:
print("Error! The filename", filename, " is not clearly distinguishable as either an ip or userid editcount file.")
# NOTE(review): only id_dict is processed below; ip input files are
# categorized above but never tallied — confirm whether intentional.
for user_dict in [self.id_dict]:
if len(user_dict['input_files'])==0:
if self.verbose: print(' Beginning editcount set ' + user_dict['ext'])
if self.verbose: print(' Found no files which contained editcounts by ' + user_dict['ext'])
continue
for block_num,block in enumerate(user_dict['user_blocks']): #ranges of possible user ids or ips
#for block_num in range(23,24): #ranges of possible user ids or ips
if self.verbose: print(' starting new block', block_num, 'out of 100 blocks...')
loc_block=editcount_folder+'edits.'+user_dict['ext']+'.'+str(block)+'.txt'
block_data={}
if os.path.isfile(loc_block):
if self.verbose: print(' loading old block data', loc_block)
unformatted=[[int(x),int(y)] for x,y in utility.csv_read(loc_block)]
block_data=dict(unformatted)
i=0
timer={'open/seek':0,'tell':0,'readline':0,
'interpret':0,'compare':0, 'incl':0}
for filename in sorted(user_dict['input_files']):
i+=1
if i%100==0 and self.verbose==2:
print(os.path.basename(filename))
for item in timer:
print(" ", item, " | ", str(timer[item]))
tmptime=time()
f_source = open(filename,'r')
f_source.seek(user_dict['input_files'][filename])
timer['open/seek']+=(time()-tmptime)
while True:
# NOTE(review): this tell-timer measures the gap between two
# adjacent time() calls — it times nothing.
tmptime=time()
timer['tell']+=(time()-tmptime)
tmptime=time()
user_dict['input_files'][filename]=f_source.tell()
data=f_source.read(2000)
row_block=data.split('\n')
if len(row_block)==1:
break
# Read on byte-by-byte until the chunk ends on a line boundary.
if row_block[-1] != '':
newdata='bleaugh'
while newdata != '\n' and newdata != '':
newdata=f_source.read(1)
if newdata =='\n':
row_block.append('')
else:
row_block[-1]+=newdata
breaker=False
for row_num in range(len(row_block)):
tmptime=time()
row=row_block[row_num]
if row=='':
break
user,edits=[int(x) for x in row.split(',')]
timer['interpret']+=(time()-tmptime)
tmptime=time()
#if user==2332919:print(filename, "a",user)
# Stop this file once its (sorted) users pass the current block's
# upper bound; the saved offset resumes it on the next block.
if user >= block:
if user >= block+user_dict['interval']:
breaker=True
break
timer['compare']+=(time()-tmptime)
tmptime=time()
if user not in block_data:
block_data[user]=edits
#if user==2332919:print("b",user, block_data[user])
else:
block_data[user]+=edits
#if user==2332919:print("c",user, block_data[user])
timer['incl']+=(time()-tmptime)
#if user==2332919:print("d",user, block_data[user])
if breaker: break
f_source.close()
writable = sorted(block_data.items(),key=operator.itemgetter(0))
f_block=open(loc_block, 'w')
for item in writable:
f_block.write(str(item[0])+','+str(item[1])+'\n')
f_block.flush()
f_block.close()
# Checkpoint the per-file offsets so an interrupted run can be inspected.
safety_valve_progress=editcount_folder+\
'safety_valve_progress.'+ user_dict['ext'] + str(block) + '.txt'
utility.csv_write(safety_valve_progress,
sorted(user_dict['input_files'],key=operator.itemgetter(0)))
# Lazily load the "experienced user" lists (one file of base-10 ips, one of
# ids) into self.gt; subsequent calls are no-ops.
def activate_gt(self, ips_gt, ids_gt):
try:
if self.gt:
return True
except:
self.gt={'ip':[int(x.rstrip()) for x in open(ips_gt,'r')],
'id':[int(x.rstrip()) for x in open(ids_gt,'r')]}
def get_inx_pages(self, pagelist,
limit=50, ips_gt=None, ids_gt=None):
"""
pagelist=just any list of lists where the last element of each itemlist is a str userid or a str base-256 ip addr.
if the user or ip is found to be inexperienced,
all elements but the last element are included as one of many in a results list.
ips_gt=sorted list of base-10 ips with a number of edits
that exceed the number of edits that qualify
them as 'experienced,' and thus should return a false value.
ips_lt=sorted list of userids, same as above
limit = not implemented yet. in future, will automate creation
and use of ips_gt, ips_lt from the folder where editcounts were tallied by user.
"""
#returns a list of only the pages which have *less* edits than the limit
results=[]
pagelist2={'ip':[],'id':[]}
for page in pagelist:
userid=page[-1]
is_ip=re.findall(self.ip_regex,userid)
if is_ip:
page[-1]=self.ip_to_int(is_ip[0])
pagelist2['ip'].append(page)
elif re.search('^\d+$', userid):
page[-1]=int(userid)
pagelist2['id'].append(page)
for setname in ['ip','id']:
pagelist2[setname]=sorted(pagelist2[setname],key=operator.itemgetter(-1))
users_shadow=[x[-1] for x in pagelist2[setname]]
inx_list=self.has_less_edits_than(setname=setname,
usernames=users_shadow,ips_gt=ips_gt,ids_gt=ids_gt)
for i in range(len(inx_list)):
if inx_list[i]:
results.append(pagelist2[setname][i])
return results
def has_less_edits_than(self, setname='ip',
usernames=[], ips_gt=None, ids_gt=None):
"""
usernames = list of names to test. Returned list of bools based on test.
ips_gt=sorted list of base-10 ips with a number of edits
that exceed the number of edits that qualify
them as 'experienced,' and thus should return a false value.
ips_lt=sorted list of userids, same as above
limit = not implemented yet. in future, will automate creation
and use of ips_gt, ips_lt from the folder where editcounts were tallied by user.
"""
#returns a list of only the users which have *less* edits than the limit
self.activate_gt(ips_gt,ids_gt)
results=[]
userlist=usernames
len_userlist=len(userlist)
gtlist=sorted(self.gt[setname]) #both gtlist and userlist should supposedly be sorted and of the same type by this line, making the following algorithm pretty efficient.
len_gtlist=len(gtlist)
# Two-cursor walk over the sorted lists; last_res remembers which
# direction the gt cursor moved last so a user strictly between two gt
# entries (or past either end) is marked True (inexperienced).
user_cursor=0
gt_cursor=0
last_res=0
print(setname,len_userlist)
bcs=0
ds=0
while user_cursor!=len_userlist:
user=userlist[user_cursor]
print(len_userlist, user_cursor, len_gtlist,gt_cursor)
gtpos=gtlist[gt_cursor]
#if user==104025: print('a',user,gtpos,user_cursor,gt_cursor)
#104523
if gtpos < user:
if last_res==-1:
user_cursor+=1
results.append(True)
last_res=0
#if user==104025: print('bI',user,gtpos,user_cursor,gt_cursor)
bcs+=1
else:
if gt_cursor+1<len_gtlist:
gt_cursor+=1
last_res=1
else:
user_cursor+=1
#if user==104025: print('bII',user,gtpos,user_cursor,gt_cursor)
elif gtpos > user:
if last_res==1:
user_cursor+=1
results.append(True)
last_res=0
#if user==104025: print('cI',user,gtpos,user_cursor,gt_cursor)
bcs+=1
else:
if gt_cursor>0:
gt_cursor-=1
last_res=-1
else:
user_cursor+=1
#if user==104025: print('cII',user,gtpos,user_cursor,gt_cursor)
elif gtpos == user:
results.append(False)
user_cursor+=1
last_res=0
#if user==104025: print('d',user,gtpos,user_cursor,gt_cursor)
ds+=1
print('d',ds,'bc',bcs)
# NOTE(review): in the gt_cursor-at-boundary branches above, user_cursor
# advances without appending to results, so results can end up shorter
# than usernames — get_inx_pages indexes the two in lockstep; confirm.
return results
utility.py
edit — I know, I know, more descriptive names; I'll give them some. This is just a set of toolbox functions I typically carry with me everywhere.
#utility.py
#V1
DEBUG=True
import pickle, textwrap, os, csv
from glob import glob
def pickle_data(file_addr, data):
    """Serialize *data* to the file at *file_addr* with pickle.

    Uses a context manager so the handle is flushed and closed even if
    pickle.dump raises (the original left the file open on error).
    """
    with open(file_addr, 'wb') as f_pickle:
        pickle.dump(data, f_pickle)
def unpickle_data(file_addr, defaultobject=None):
    """Load pickled data from *file_addr*.

    If the file is not readable (typically: does not exist yet), seed it
    with *defaultobject* and return that instead.

    Fixes: the original `pickle.load(open(...))` never closed its handle;
    both paths now use context managers.
    """
    if os.access(file_addr, os.R_OK):
        with open(file_addr, 'rb') as f_pickle:
            return pickle.load(f_pickle)
    with open(file_addr, 'wb') as f_pickle:
        pickle.dump(defaultobject, f_pickle)
    return defaultobject
def flatten_list(list_item):
    """Return a flat list with every nested list in *list_item* expanded.

    Non-list elements (including tuples and strings) are kept as-is;
    lists are flattened recursively to any depth.

    Simplification: the original special-cased "list with no inner lists"
    with an extend(); recursing on such a list appends exactly the same
    elements, so the three-way branch collapses to two.
    """
    product = list()
    for x in list_item:
        if type(x) != list:
            product.append(x)
        else:
            product.extend(flatten_list(x))
    return product
def glob_list(args1):
    """Expand each shell-style pattern in *args1* with glob and return
    all matches concatenated into one list (order of patterns preserved)."""
    return [match for pattern in args1 for match in glob(pattern)]
def dbgmsg(text, links=False):
    """Print a debug message when the module-level DEBUG flag is set.

    With links=True the text is printed verbatim (no wrapping) —
    presumably so URLs stay on one copy-pasteable line; otherwise it is
    wrapped to the terminal-friendly default width.
    """
    if not DEBUG:
        return
    message = " DEBUG: " + text
    if links:
        print(message)
    else:
        print(textwrap.fill(message))
def csv_write(filename, rowlist):
    """Write *rowlist* (a list of row sequences) to *filename* as CSV.

    Returns True on completion.

    Fixes: opens with newline='' as the csv module requires (without it,
    csv.writer emits doubled line endings on Windows), and uses a context
    manager instead of manual flush/close.
    """
    with open(filename, 'w', newline='') as f_csv:
        csv.writer(f_csv).writerows(rowlist)
    return True
def csv_read(filename, isfile=True):
    """Read CSV rows into a list of lists.

    filename: a path when isfile is True; otherwise any iterable of CSV
    lines (e.g. a list of strings) fed straight to csv.reader.

    Fixes: the file handle is closed via a context manager, opened with
    newline='' per the csv module docs, and the manual append loop /
    pointless `del reader` are replaced with list().
    """
    if isfile:
        with open(filename, 'r', newline='') as f_csv:
            return list(csv.reader(f_csv))
    return list(csv.reader(filename))
serch.py
edit — this is the way to update editor data from the website in real time. It is incredibly slow and server-heavy; that's why you only use it on the list of pages which had a single editor as of your most recent version of the stub-meta-history file. That way it is about 1/26th the number of files to check, and it doesn't take several months and dozens of GB of transfer.
#!/bin/python
#tool for checking real time from a list of wikipage titles
#whether the page has more than one contributor,
#is a redirect, or has templates, and such things.
#but because this tool is rather slow and heavy on
#the server load... better to use it on small list
#of wikipages just to keep them up2date.
import csv
from urllib.parse import quote
import os
import sys
import re
from hashlib import md5
from utility import *
def wget(link, outfile):
    """Download *link* to *outfile* using wget in quiet mode.

    Security fix: the original built a shell command string with
    os.system(), so quotes or metacharacters in the URL could be
    interpreted by the shell. An argument vector with shell=False
    passes the URL through untouched.
    """
    import subprocess  # local import keeps this fix self-contained
    subprocess.run(['wget', '-q', link, '-O', outfile])
def make_urls():
    """Build the Special:Export URL templates used to fetch page data.

    Returns a dict with two entries, each a {'prefix', 'suffix'} pair
    that surrounds a page title:
      'current' — the latest revision only (limit=1)
      'data'    — the last ten revisions of history (limit=10)
    """
    export_prefix = 'http://en.wikipedia.org/w/index.php?title=Special:Export&pages='
    return {
        'current': {
            'prefix': export_prefix,
            'suffix': '&limit=1&action=submit',
        },
        'data': {
            'prefix': export_prefix,
            'suffix': '&limit=10&action=submit&history',
        },
    }
def get_specific_link(url_book, pagename):
    """Assemble the two per-page URLs for *pagename* from the templates
    in *url_book*: 'current' (latest revision) and 'data' (history)."""
    return {
        kind: url_book[kind]['prefix'] + pagename + url_book[kind]['suffix']
        for kind in ('current', 'data')
    }
def read_link(url_to_get, localaddr):
    """Download *url_to_get* to *localaddr* and return an open read handle.

    The previous copy at localaddr is removed first so a failed download
    cannot silently serve stale data. Fix: os.remove replaces the old
    os.system('rm ...') call — no shell involved, and no error message
    spewed when the file doesn't exist yet.
    """
    try:
        os.remove(localaddr)
    except OSError:
        pass  # nothing to delete on the first run
    wget(url_to_get, localaddr)
    pagesrc = open(localaddr, 'r')
    return pagesrc
"""class HistoryChecker():
def __init__(self):
def load_from_web(self, web_addr):
dbgmsg('getting contributors')
f_contrib=read_link(web_addr,'/tmp/contrib.txt')
dbgmsg('done')
self.contrbrs=f_contrib.readlines()
self.contrbrs=[re.sub('^.*?\t(.*?)\t.*','\g<1>',x).rstrip() for x in self.contrbrs]
self.contrbrs=self.de_bot(self.contrbrs)
f_contrib.close()
return True
def gauntlet(self, level=0):
if level >=0:
for test in [self.check_max_editors,
self.check_min_editors]:
if not test(self.contrbrs): return False
#if level >=1:
# for test in [self.check_editor_bg]:
# if not test(): return False
#if level >=2:
# pass
dbgmsg("PASSED level " + str(level) + " contributor check.")
return True"""
class ContentChecker():
    """Fetches a wiki page via Special:Export and checks it: redirect
    status, continued existence, template use, and whether it has had
    more than one distinct human (non-bot) editor."""

    def __init__(self):
        # Known bot account names, one per line, compared lowercased.
        # Fix: 'with' closes the handle (the original leaked it).
        with open('bot_list.txt', 'r') as f_bot:
            self.bot_list = [x.rstrip().lower() for x in f_bot.readlines()]

    def test_if_redirect(self, pagename, web_addr):
        """Download the current revision and record in self.is_not_redirect
        (and return) whether the page is a live, non-redirect article."""
        f_page = read_link(web_addr, '/tmp/x.xml')
        data = f_page.read()[:2750]  # export header + start of text is enough
        f_page.close()  # fix: handle was never closed
        # fix: the title regex used to run twice; search once and reuse.
        title_match = re.search('<title>(.+?)</title>', data)
        if not title_match:
            # okay, well technically it's probably a defunct page, but
            # whatever, nomenclature later...
            self.is_not_redirect = False
            return self.is_not_redirect
        if quote(title_match.group(1)) == pagename:
            if not re.search(">\s*\#redirect(\s|$)", data.lower()):
                print("not a redirect")
                self.is_not_redirect = True
                return self.is_not_redirect
        print("a redirect")
        self.is_not_redirect = False
        return self.is_not_redirect

    def load_from_web(self, web_addr):
        """Download revision history (only when the redirect test passed)
        and extract the editor list. Always returns True."""
        if self.is_not_redirect:
            dbgmsg('getting content')
            # if we wanted to read content from a database, this is where
            # we'd do it instead; the parameter would be a pagename.
            f_page = read_link(web_addr, '/tmp/x.xml')
            self.data = f_page.read().lower()
            f_page.close()  # fix: handle was never closed
            self.editors = self.get_editors(self.data)
        else:
            self.data = ''
            self.editors = ''
        return True

    def gauntlet(self, level=0):
        """Run the checks appropriate to *level*; return False at the
        first failure, True when everything passed."""
        if not self.is_not_redirect: return False
        if level >= 0:
            for test in [self.check_still_exists]:
                if not test(self.data): return False
            for test in [self.check_max_editors]:
                if not test(self.editors): return False
        if level >= 1:
            for test in [self.check_no_template]:
                if not test(self.data): return False
        if level >= 2:
            pass  # reserved for stricter future checks
        dbgmsg("PASSED level " + str(level) + " content check.")
        return True

    def de_bot(self, usernames):
        """Return *usernames* with suspected bots removed: any name whose
        last five characters contain 'bot' (case-insensitive) or that
        appears in the bot list."""
        usernames2 = []
        for name in usernames:
            if 'bot' not in name[-5:].lower() and \
               name not in self.bot_list:
                usernames2.append(name)
        return usernames2

    def get_editors(self, pagedata, revision_count=9):
        """Scan export XML line by line and collect the distinct editors
        (usernames and IPs) inside <contributor> blocks, stopping once
        revision_count distinct editors have been seen. Returns the list
        with bots filtered out.
        Suggested: pagedata should include at least 5 revisions."""
        pagedata2 = pagedata.split('\n')
        editors = set()
        contributor_block = False
        for line in pagedata2:
            if '<contributor>' in line:
                contributor_block = True
            elif not contributor_block:
                continue  # only parse lines inside a contributor block
            elif '</contributor>' in line:
                contributor_block = False
                if len(editors) == revision_count:
                    print(repr(editors))
                    break  # enough distinct editors; no need to read on
            elif '<username>' in line:
                editors.add(re.sub('^\s*<username>(((?!username>).)*)</username>\s*$', '\g<1>', line))
            elif '<ip>' in line:
                editors.add(re.sub('^\s*<ip>(((?!ip>).)*)</ip>\s*$', '\g<1>', line))
        return self.de_bot(editors)

    def check_still_exists(self, pagedata):
        """Check the download isn't the stock export for a nonexistent page.

        Only useful when the content source is more recent than the page
        title list — say, live data from Special:Export.
        NOTE(review): hashes self.data rather than the pagedata argument;
        callers currently pass self.data anyway, so behavior is unchanged —
        confirm before reusing with other input.
        """
        pagehash = md5(self.data.encode())
        # md5 digests of the (lowercased) XML returned when you use the
        # URL of a non-existent page.
        if pagehash.hexdigest() in ['caa3fe485e6f6518af1e5ea59e131f68', '3a98a2e740d741a7750f034a99e70025', 'f8f49e37b4c4bff5ecac639237a0129f']:
            dbgmsg("X: wiki page no longer exists")
            return False
        else:
            print(pagehash.hexdigest())
            return True

    def check_no_template(self, pagedata):
        """Fail the page if it contains any template invocation ('{{')."""
        if re.search("{{", pagedata):
            dbgmsg("X: has a template")
            return False
        return True

    def check_max_editors(self, contributors):
        """Fail the page if more than one distinct human editor remains."""
        if len(contributors) > 1:
            dbgmsg("X: >1 contributors")
            print(repr(contributors))
            return False
        print(repr(contributors))
        return True

    def check_min_editors(self, contributors):
        """Fail the page if only bots edited it. This test may be excluded
        if you think it's important to check bot-created pages for sanity."""
        if len(contributors) == 0:
            dbgmsg("X: only bot contributors")
            return False
        return True
def main(titlefile):
    """Walk the title list in batches of 100, query Wikipedia for each
    page, and write titles that still have a single human author to a
    per-batch results file. The byte offset reached in the title file is
    pickled so an interrupted run resumes where it left off.

    Fixes: read loop breaks at EOF instead of spinning through empty
    reads; files are closed via context managers; the progress message
    reported '50 pages' although the batch size is 100; unused counter
    removed.
    """
    loc_addrfile = 'stored_data.pickle'
    lastloc = unpickle_data(loc_addrfile, 0)  # offset where the last run stopped
    url_book = make_urls()
    loc_output = './results/results'
    # The loop iterates through the title FILE, not a variable holding all
    # its data: large title lists would freeze us up if put in memory.
    with open(titlefile, 'r') as f_titles:
        f_titles.seek(0, 2)  # find the byte address of the end of file
        loc_end_of_file = f_titles.tell()
        f_titles.seek(lastloc)
        while f_titles.tell() < loc_end_of_file:
            one_author_only = []  # pages found to have one author
            handful = []  # batch of page titles to check
            dbgmsg("getting titles")
            for _ in range(100):
                line = f_titles.readline()
                if not line:
                    break  # EOF reached mid-batch
                handful.append(line.rstrip())  # assumes titles are already URL-quoted
            contentcheck = ContentChecker()
            for pagename in handful:
                # Build the URLs for the page's current revision and history.
                link = get_specific_link(url_book, pagename)
                dbgmsg(str(lastloc) + 'page addr:' + link['current'], links=True)
                if not contentcheck.test_if_redirect(pagename=pagename, web_addr=link['current']):
                    continue  # next pagename
                dbgmsg(str(lastloc) + 'page addr:' + link['data'], links=True)
                contentcheck.load_from_web(link['data'])
                if not contentcheck.gauntlet():
                    continue  # next pagename
                one_author_only.append(pagename + '\n')
            dbgmsg("adding new data")
            with open(loc_output + str(lastloc) + '.txt', 'w') as f_results:
                f_results.writelines(one_author_only)
            lastloc = int(f_titles.tell())
            print('read ' + str(len(handful)) + ' pages\' history, of which ',
                  str(len(one_author_only)),
                  ' met conditions. We are at:', lastloc)
            dbgmsg("storing data")
            pickle_data(loc_addrfile, lastloc)

if __name__ == '__main__':
    main(titlefile=sys.argv[1])
"""
This program updates, via the internet, all the suspected one-author pages to see whether that is still true. It breaks the list down into a bunch of files in the results folder. Together those files list the pages which really do still seem to have only one author. Concatenate them into one file afterwards by doing...
python3.0 serch.py
cd results
cat *.txt > ../one_author_pages.title
You'll probably want to change this into a pageset so you can remove pages with experienced authors, so here we go — here's how to work backwards and do that.
cd ..
python
import utility
data_based=utility.csv_read('one_author_pages_prelim.csv')
int_based=open('one_author_pages.title')
r2=[x.rstrip() for x in int_based.readlines()]
dictform={}
for page in data_based:
dictform[page[0]]=page[1:]
for page in r2:
if page not in dictform:
print(page) #should print nothing, as int_based was just a narrowing down of data_based
newcsv=[]
for page in r2:
a=[page]
a.extend(dictform[page])
newcsv.append(a)
utility.csv_write('One_author_Pageset.csv',newcsv)
"""
shell commands
edit — a couple of shell commands I made use of. I need to integrate these into the code, even though it will take more lines when done in Python. They may seem random and unintuitive, but they're mostly for quickly converting from a pageset to a title list, or for dealing with editcount data.
#from /opt/editcounts/*
# Split the per-user edit-count files (id,count per line) into users whose
# count is 1-2 digits (<= 99 edits) and everyone else (> 99 edits).
grep -hiE '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ids.*.txt > ids_lt_99_edits
grep -hiEv '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ids.*.txt > ids_gt_99_edits
# Same split for the anonymous (IP-keyed) edit counts.
grep -hiE '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ips.*.txt > ips_lt_99_edits
grep -hiEv '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ips.*.txt > ips_gt_99_edits
# Split the pageset by the last field — presumably rows whose final field is
# purely numeric mark bot-made pages (verify against the pageset format).
grep -E "^[^,]+,[0-9]+\s*$" limited.txt > updated_list_as_pageset_bot_made_only
grep -Ev "^[^,]+,[0-9]+\s*$" limited.txt > updated_list_as_pageset_with_humans
# Strip the trailing ,count field, leaving one identifier per line.
sed -r 's/,[0-9]+\s*$//g' ips_gt_99_edits > iplist_gt_edits
#just a list of base 10 ips, doesn't include editcounts
sed -r 's/,[0-9]+\s*$//g' ids_gt_99_edits > idlist_gt_edits
#just a list of ids, doesn't include editcounts
get_redirects.py
edit — deals with the enwiki-pages.sql file to get a list of redirects for wiki_pageset.py; usually called on its own, with a little bit of customization.
import re, random, sqlite3
import pageparser_db
from urllib.parse import quote
# Scan the enwiki page.sql dump in large chunks and write the titles of
# redirect pages (URL-quoted, one per line) to 'redirect_list'.
d = open('enwiki-20081008-page.sql')
#d=open('page.sql')
redirects = []
d.seek(0, 2)
eof_loc = d.tell()  # total byte size of the dump
d.seek(0)
# BUG FIX: this counter used to be 'i', and the inner per-title loop also
# used 'i', clobbering the chunk count that gates flushing/progress output.
chunks_read = 0
#base, cu=pageparser_db.connect_redirect_base()
f_r = open('redirect_list', 'w')
#initial page id only? i dunno, seems like it might be good to check for both
#though, cause this definitely removed some when I used it initially.
"""
while d.tell() < eof_loc:
    content=d.read(1000000)
    redirect_data=re.findall("\((\d+),\d+,\'.+?\',\'.*?\',\d+,(\d)", content)
    for article in redirect_data:
        if int(article[1])==1:
            if random.randint(1,10000)==500:
                redirects.append(article[0])
    del redirect_data
    i+=1
    print("ahoy", str(i))
"""
#title paired with is_redirect
while d.tell() < eof_loc:
    content = d.read(40000000)
    # Row shape: (page_id, namespace, 'title', 'restrictions', counter, is_redirect, ...
    # The trailing ',1' keeps only rows whose is_redirect flag is set.
    redirect_data = re.findall("\(\d+,\d+,\'(.+?)\',\'.*?\',\d+,1", content)
    # SQL titles use underscores for spaces; restore them, then URL-quote.
    redirect_data = [quote(re.sub('_', ' ', title)) + '\n' for title in redirect_data]
    f_r.writelines(redirect_data)
    del redirect_data  # free the chunk's titles before the next 40 MB read
    chunks_read += 1
    if chunks_read > 5:
        f_r.flush()
    print("ahoy", str(chunks_read))
    print("we are at", str(int(100*d.tell()/float(eof_loc))), "%...")
"""
#here, the redirects field comes before the page_latest_id field,
#so we use article 0.
while d.tell() < eof_loc:
    content=d.read(40000000)
    redirect_data=re.findall("\(\d+,\d+,'.+?','.*?',\d+,1,\d+,[\d\.]+?,'\d+?',(\d+)", content)
    for j in range(len(redirect_data)):
        redirect_data[j]=redirect_data[j]+'\n'
    f_r.writelines(redirect_data)
    del redirect_data
    chunks_read+=1
    if chunks_read>5:
        f_r.flush()
    print("we are at", str(int(100*d.tell()/float(eof_loc))), "%...")
"""
f_r.flush()
f_r.close()
d.close()  # fix: the dump handle was left open
list of bots
edit — the bot list used can be found here, though you'll probably want the more recent version from the category page.