# User:BogBot/Source code/Task 06

# Generate the input article list (drugbox.txt) with:
#   python pagegenerators.py -transcludes:"Infobox drug" -ns:0 > drugbox.txt

import codecs
import mwparserfromhell
import re
import wikipedia

def param(template,parameter):
    """Return the stripped value of a template parameter as unicode text.

    template  -- object whose get(parameter) returns something with a
                 ``value`` attribute (presumably an mwparserfromhell
                 Template -- confirm against the caller).
    parameter -- name of the parameter to look up.

    If the lookup (or the conversion) raises ValueError, an empty unicode
    string is returned instead.
    """
    try:
        # The conversion is kept inside the try so that exactly the same
        # ValueErrors are turned into an empty result as before.
        return to_unicode(template.get(parameter).value.strip())
    except ValueError:
        return to_unicode("")

def to_unicode(obj, encoding='utf-8'):
    """Return obj as text, decoding it first if it is a byte string.

    obj      -- any object.  Byte strings are decoded; everything else
                (text strings, numbers, None, ...) is returned unchanged.
    encoding -- codec used to decode byte strings (default 'utf-8').

    Raises UnicodeDecodeError if a byte string is not valid in `encoding`.
    """
    # `bytes` is an alias of `str` on Python 2 and the dedicated byte-string
    # type on Python 3, so this one check covers both interpreters without
    # relying on the Python-2-only names `basestring` and `unicode`.
    if isinstance(obj, bytes):
        obj = obj.decode(encoding)
    return obj

def get_param_value(template, param_name):
    """Return the first line of a template parameter, encoded as UTF-8.

    The value is fetched with param() and split into lines; only the first
    line is kept.  When there are no lines at all, an empty string is
    encoded and returned instead.
    """
    lines = param(template,param_name).splitlines()
    first_line = lines[0] if lines else ""
    return first_line.encode('utf-8')

# Print a tab-separated table of identifier fields taken from drug infoboxes.
#
# Article titles are read one per line from the list file.  Each page is
# fetched and parsed, and one row is printed for every template on it whose
# name is "drugbox" or "infobox drug" (compared case-insensitively).

# Infobox parameters to export, in column order; each name doubles as the
# column heading.  ChEMBL used to be looked up as well but was never written
# to the output, so it is intentionally not part of this list.
fields = ("IUPAC_name", "CAS_number", "IUPHAR_ligand", "ChemSpiderID", "UNII", "KEGG", "ChEBI", "StdInChI", "StdInChIKey")
infobox_names = ("drugbox", "infobox drug")

# Header row.  The parenthesised single-argument form of print produces the
# same output under the Python 2 print statement.
seq = ("article",) + fields
print('\t'.join(seq))

# The site object does not depend on the article, so fetch it only once.
site = wikipedia.getSite()

# The with-block guarantees the list file is closed even if a page fetch fails.
with codecs.open('/Users/bogbot/progs/compat/drugbox.txt', mode = 'r', encoding='utf-8') as articles:
    for article in articles:

        article = to_unicode(article.strip())
        if not article:
            # A blank line in the list file is not a page title; skip it.
            continue

        page = wikipedia.Page(site, article)
        text = page.get(get_redirect = True)

        wikicode = mwparserfromhell.parse(text)

        for template in wikicode.filter_templates():

            template_name = template.name.strip().lower()
            if template_name not in infobox_names:
                continue

            # get_param_value() returns UTF-8 encoded strings, so the title
            # is encoded the same way before the row is joined.
            seq = (article.encode('utf-8'),) + tuple(get_param_value(template, field) for field in fields)
            print('\t'.join(seq))