# python pagegenerators.py -transcludes:"Infobox drug" -ns:0 > drugbox.txt
import codecs
import mwparserfromhell
import re
import wikipedia
def param(template, parameter):
    """Return the stripped text of *parameter* in *template* as unicode.

    If the lookup (or the conversion to unicode) raises ``ValueError`` --
    which this code treats as "parameter not available" -- an empty unicode
    string is returned instead.
    """
    try:
        return to_unicode(template.get(parameter).value.strip())
    except ValueError:
        return to_unicode("")
def to_unicode(obj, encoding='utf-8'):
    """Return *obj* as text, decoding byte strings with *encoding*.

    Byte strings (``str`` on Python 2, ``bytes`` on Python 3) are decoded
    using *encoding*; text strings and non-string objects are returned
    unchanged.

    Raises ``UnicodeDecodeError`` if a byte string is not valid in
    *encoding*.
    """
    # ``bytes`` is an alias of ``str`` on Python 2.6+, so this test selects
    # exactly the "basestring but not unicode" objects while no longer
    # depending on builtins that do not exist on Python 3.
    if isinstance(obj, bytes):
        return obj.decode(encoding)
    return obj
def get_param_value(template, param_name):
    """Return the first line of a template parameter, UTF-8 encoded.

    The value is fetched with ``param``; only its first line is kept, so a
    multi-line value yields a single line. An empty value (no lines at all)
    yields an empty encoded string.
    """
    lines = param(template, param_name).splitlines()
    first_line = lines[0] if lines else ""
    return first_line.encode('utf-8')
# File with one article title per line (see the pagegenerators command at
# the top of this file).
article_list_path = '/Users/bogbot/progs/compat/drugbox.txt'

# Infobox parameters written for every matching template, in column order.
# NOTE: 'ChEMBL' used to be read as well but was never written out; it is
# left out here so the output columns stay exactly as before.
fields = ("IUPAC_name", "CAS_number", "IUPHAR_ligand", "ChemSpiderID",
          "UNII", "KEGG", "ChEBI", "StdInChI", "StdInChIKey")

# Lower-cased template names that are treated as a drug infobox.
infobox_names = ("drugbox", "infobox drug")

# ``with`` guarantees the title list is closed even if a page fetch fails.
with codecs.open(article_list_path, mode='r', encoding='utf-8') as articles:
    # Header row of the tab-separated output.
    print('\t'.join(("article",) + fields))

    # The site object does not depend on the article, so fetch it once.
    site = wikipedia.getSite()

    for article in articles:
        article = to_unicode(article.strip())
        if not article:
            # A blank line is not a page title; skip it.
            continue

        page = wikipedia.Page(site, article)
        text = page.get(get_redirect=True)
        wikicode = mwparserfromhell.parse(text)

        for template in wikicode.filter_templates():
            template_name = template.name.strip().lower()
            if template_name not in infobox_names:
                continue
            # One output row per matching infobox: title, then each field.
            row = [article.encode('utf-8')]
            row.extend(get_param_value(template, field) for field in fields)
            print('\t'.join(row))