Wikipedia:Database reports/Articles containing invalid template parameters/Configuration

bullshitparams.py

edit
#! /usr/bin/env python
# Public domain; MZMcBride; 2011

import datetime
import codecs
import re
import MySQLdb
import wikitools
import settings

def get_target_templates_list():
    """Return a new list of the template names this report audits.

    Names are written with underscores instead of spaces.
    """
    target_templates = ['Infobox_officeholder']
    return target_templates

def get_template_parameters_from_template(template):
    """Collect the parameter names referenced by a template's wikitext.

    The wikitext of ``Template:<template>`` is fetched through the
    module-level ``wiki`` object and scanned in two passes:

    * names assembled with a conditional, written as
      ``prefix{{#if:{{{x|}}}|a|b}}suffix`` -- both ``prefix+a+suffix``
      and ``prefix+b+suffix`` are recorded;
    * plain ``{{{name|`` / ``{{{name}`` references, with surrounding
      whitespace stripped.

    Returns a set of parameter-name strings.
    """
    page = wikitools.Page(wiki, 'Template:'+template)
    wikitext = page.getWikiText()

    # Character classes accepted inside a parameter name; the second one
    # is the same class minus the space character.
    legal_chars = r'[ %!"$&\'()*,\-.0-9:;?@A-Z^_`a-z~\x80-\xFF]'
    legal_chars_spaceless = r'[%!"$&\'()*,\-.0-9:;?@A-Z^_`a-z~\x80-\xFF]'

    # Five groups: prefix, tested parameter, "then" text, "else" text, suffix.
    dynamic_pattern = ''.join([
        r'(', legal_chars_spaceless, '+',
        r')\{\{#if:\{\{\{(', legal_chars, '+',
        r')\|\}\}\}\|(', legal_chars, '*',
        r')\|(', legal_chars, '*',
        r')\}\}(', legal_chars, '+',
        r')',
    ])

    names = set()
    for found in re.compile(dynamic_pattern).finditer(wikitext):
        prefix, _tested, then_part, else_part, suffix = found.groups()
        names.update((prefix + then_part + suffix,
                      prefix + else_part + suffix))

    plain_re = re.compile(r'\{\{\{([ %!"$&\'()*,\-.0-9:;?@A-Z^_`a-z~\x80-\xFF]+)(\||\})', re.I|re.MULTILINE)
    names.update(found.group(1).strip()
                 for found in plain_re.finditer(wikitext))
    return names

def get_articles_list(cursor, template):
    """Return the titles of pages that link to ``template`` via templatelinks.

    Runs a query joining ``page`` and ``templatelinks`` and keeps rows where
    the link target is ``template`` in namespace 10, and the linking page is
    in namespace 0 and is not a redirect.

    Parameters:
        cursor: an open DB-API cursor on the wiki database.
        template: template title as stored in ``tl_title`` (underscores,
            no namespace prefix).

    Returns:
        A list of page titles decoded from UTF-8, in the order the database
        returned them; an empty list when nothing matches.
    """
    # The parameter is passed as a one-element tuple: DB-API 2.0 expects a
    # sequence (or mapping), and drivers that iterate the argument would
    # otherwise treat a bare string as one parameter per character.
    cursor.execute('''
                   /* bullshitparams.py SLOW_OK */
                   SELECT
                     page_title
                   FROM page
                   JOIN templatelinks
                   ON tl_from = page_id
                   WHERE tl_namespace = 10
                   AND tl_title = %s
                   AND page_namespace = 0
                   AND page_is_redirect = 0;
                   ''', (template,))
    # page_title is handled as UTF-8 encoded bytes, as in the rest of the
    # script's database handling.
    return [row[0].decode('utf-8') for row in cursor.fetchall()]

def get_template_parameters_from_article(article, templates):
    """Collect the parameter names an article passes to the given templates.

    The article's wikitext is fetched through the module-level ``wiki``
    object.  For each name in ``templates`` the first transclusion is
    located (case-insensitively; each underscore in the name matches any
    run of whitespace/underscores), its extent is found by counting brace
    pairs until they balance, nested ``{{...}}`` calls are removed, and
    every ``| name =`` occurrence in what remains is recorded.

    Parameters:
        article: title of the page to inspect.
        templates: iterable of template names (underscores for spaces).

    Returns:
        A set of parameter names, stripped of surrounding whitespace.
        Templates that are absent from the article, or whose braces never
        balance, contribute nothing.
    """
    # All of these are loop-invariant, so they are compiled once up front.
    inner_template_re = re.compile(r'\{\{[^}]+\}\}', re.I|re.MULTILINE)
    parameter_re = re.compile(r'\|\s*([ %!"$&\'()*,\-.0-9:;?@A-Z^_`a-z~\x80-\xFF]+)\s*=', re.I|re.MULTILINE)
    # The two-brace alternatives come first, so every token this yields is
    # exactly '{{' or '}}'; the three-brace alternatives can never win.
    brace_re = re.compile(r'(\{\{|\{\{\{|\}\}|\}\}\})')

    article_parameters = set()
    article_text = wikitools.Page(wiki, article).getWikiText()
    for template in templates:
        template_re = re.compile(r'\{\{\s*%s\s*(.*?)\}\}' % template.replace('_', r'[\s_]*'), re.I|re.MULTILINE|re.DOTALL)
        found = template_re.search(article_text)
        if not found:
            continue

        # Walk the brace tokens from the start of the transclusion until
        # openers and closers balance; that token ends the template call.
        remainder = article_text[found.start():]
        template_content = None
        opened = 0
        closed = 0
        for brace in brace_re.finditer(remainder):
            if brace.group(0).startswith('{'):
                opened += 1
            else:
                closed += 1
            if opened == closed:
                template_content = remainder[:brace.end()]
                break
        if template_content is None:
            # Braces never balanced (e.g. truncated or malformed wikitext).
            # Skip this template instead of raising NameError or re-using
            # the content extracted for a previous template.
            continue

        # Drop nested template calls so their parameters are not attributed
        # to the outer template.  The first two characters (the outer '{{')
        # are skipped so the outer call itself is not matched from its start.
        for inner in inner_template_re.finditer(template_content[2:]):
            template_content = template_content.replace(inner.group(0), '')

        for parameter in parameter_re.finditer(template_content):
            article_parameters.add(parameter.group(1).strip())
    return article_parameters

# Title of the wiki page the finished report is saved to: the configured
# root page followed by the report's own name.
report_title = settings.rootpage + 'Articles containing bullshit template parameters'

# Wikitext skeleton of the report.  It is filled in with %-formatting, so the
# two %s slots receive (in order) the "data as of" timestamp and the
# newline-joined table rows, and %% stands for a literal percent sign.
# A trailing backslash inside the literal joins the line with the next one.
report_template = u'''\
Articles containing bullshit template parameters (limited to approximately \
the first 1000 entries); data as of <onlyinclude>%s</onlyinclude>.

{| class="wikitable sortable plainlinks" style="width:100%%; margin:auto;"
|- style="white-space:nowrap;"
! No.
! Page
! Parameter
|-
%s
|}
'''

# Wiki session used by every page fetch and by the final report edit.
# NOTE(review): setMaxlag(-1) presumably disables the maxlag check --
# confirm against the wikitools documentation.
wiki = wikitools.Wiki(settings.apiurl)
wiki.setMaxlag(-1)
wiki.login(settings.username, settings.password)

# Database connection; credentials come from the user's ~/.my.cnf.
conn = MySQLdb.connect(host=settings.host,
                       db=settings.dbname,
                       read_default_file='~/.my.cnf')
cursor = conn.cursor()

target_templates = get_target_templates_list()

# [article title, parameter name] pairs to be reported.
bullshit_parameters = []

# Titles already found clean on an earlier run, one per line.  They are
# skipped below so each run only fetches pages not yet reviewed.
reviewed_path = '%sbullshit-reviewed-page-titles.txt' % settings.path
with codecs.open(reviewed_path, 'r', 'utf-8') as f:
    reviewed_page_titles = f.read()
reviewed_page_titles_list = reviewed_page_titles.split('\n')
# Exact-title lookup.  Testing membership against the raw file contents
# would be a substring test and would also skip any title that merely
# occurs inside a reviewed title.
reviewed_page_titles_set = set(reviewed_page_titles_list)

# Stop once roughly this many entries are collected; the check runs per
# article, so the last article processed may push the total slightly past it.
max_entries = 1000

count = 1
with codecs.open(reviewed_path, 'a', 'utf-8') as g:
    for template in target_templates:
        if count > max_entries:
            break
        articles_list = get_articles_list(cursor, template)
        template_parameters = get_template_parameters_from_template(template)
        for article in articles_list:
            if count > max_entries:
                break
            if article in reviewed_page_titles_set:
                continue
            # NOTE(review): parameters are extracted for *all* target
            # templates but compared against the current template only;
            # harmless with a single target template -- revisit if more
            # are added.
            article_parameters = get_template_parameters_from_article(article, target_templates)
            bullshit_parameters_count = 0
            for parameter_name in article_parameters-template_parameters:
                bullshit_parameters.append([article, parameter_name])
                count += 1
                bullshit_parameters_count += 1
            if bullshit_parameters_count == 0:
                # Nothing invalid found: remember the title for future runs.
                g.write(article+'\n')

# Render one wikitable row per [article, parameter] pair, numbered from 1.
output = []
for row_number, entry in enumerate(bullshit_parameters, 1):
    page_title = u'{{dbr link|1='+entry[0].replace('_', ' ')+u'}}'
    output.append(u'''| %d
| %s
| %s
|-''' % (row_number, page_title, entry[1]))

# Age in seconds of the newest recentchanges row, used to date the report
# by the data it was built from instead of by the wall clock.
cursor.execute('''
               SELECT
                 UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp)
               FROM recentchanges
               ORDER BY rc_timestamp DESC
               LIMIT 1;
               ''')
lag_row = cursor.fetchone()
rep_lag = lag_row[0]
data_as_of = datetime.datetime.utcnow() - datetime.timedelta(seconds=rep_lag)
current_of = data_as_of.strftime('%H:%M, %d %B %Y (UTC)')

# Fill in the skeleton, encode it as UTF-8 and save it to the report page.
report_text = (report_template % (current_of, '\n'.join(output))).encode('utf-8')
report = wikitools.Page(wiki, report_title)
report.edit(report_text, summary=settings.editsumm, bot=1)

cursor.close()
conn.close()

crontab

edit
50 2 * * 6 PYTHONPATH=$HOME/scripts python $HOME/scripts/database-reports/bullshitparams.py > /dev/null