User:Woodsstock/Blame

Wikiblame

Here's a hack to find who put some nefarious text into an article. Please let me know if you have any comments or suggestions.
#!/usr/bin/perl -w
# FILE: wiki-blame.pl
# AUTHOR: TotoBaggins on en.wikipedia.org
# 
# LICENSE: GPL
#
# DESCRIPTION: This program outputs the URL of the first revision of an
# article to contain a particular bit of text.  It binary-searches the
# article's history, so we only download about log_2 N of its N revisions,
# which keeps it reasonably fast and kind to the servers.
# 
# USAGE:
#
#    ./wiki-blame.pl {article or history URL} {offending text}
#
# The URL must be url-encoded.
# The offending text should be quoted as a single argument.
# 
# EXAMPLES:
#
# Find which revision inserted some text at or before an old revision
# of the C++ article:
# 
#   ./wiki-blame.pl 'http://en.wikipedia.org/w/index.php?title=C%2B%2B&oldid=101608911' 'Evil Text'
#
#
# Find which revision inserted some text at or before the current revision
# of the C++ article:
#
#   ./wiki-blame.pl 'http://en.wikipedia.org/wiki/C%2B%2B' 'Evil Text'
#
# 
# BUGS:
#
# -- We only look back 2000 edits worth.
# -- We could be politer and faster if we stepped back through
#    history exponentially.
# -- We are too dependent on wikipedia.org's URL layout.
# 

use strict;
use LWP::UserAgent;
use HTML::Parser;
use HTML::LinkExtor;
use Carp;
use Data::Dumper;

# One HTTP client shared by every request in this script, constructed
# with keep_alive => 1.
my $WebAgent = LWP::UserAgent->new(keep_alive => 1);

# Fetch $url with the shared client and return the response content.
# Croaks with the HTTP status line when the request is not successful.
sub get_page
{
    my ($address) = @_;

    my $reply = $WebAgent->get($address);
    unless ($reply->is_success())
    {
        croak $reply->status_line();
    }

    return $reply->content();
}

# Fetch the page at $url and return the href value of every link found on
# it, as a list of URLs.
#
# hrefs are made absolute in a deliberately simple way:
#   - an href that already starts with a scheme ("http:", "mailto:", ...)
#     is returned unchanged;
#   - a protocol-relative href ("//host/path") gets the page's scheme;
#   - anything else is appended to the page's "scheme://host" prefix, which
#     is only a correct resolution for root-relative hrefs ("/w/index.php").
#
# Croaks if the page cannot be fetched or parsed.
sub get_links
{
    my $url = shift;

    # "scheme://host" part of the page URL.
    my $url_front = $url;
    $url_front =~ s,^([^/]+//[^/]+)/.*,$1,;
    my ($scheme) = $url_front =~ m{^([A-Za-z][A-Za-z0-9+.-]*:)//};

    my $page = get_page($url);
    my $linky = HTML::LinkExtor->new();
    $linky->utf8_mode(1);
    $linky->parse($page) or croak "Can't parse: $page";
    # Tell the parser the document is complete so that anything it is still
    # buffering at the end of the page is processed too.
    $linky->eof();

    my @urls;
    foreach my $link ($linky->links())
    {
        my (undef, %attr) = @$link;
        my $href = $attr{href} or next;

        if ($href =~ m{^[A-Za-z][A-Za-z0-9+.-]*:})
        {
            # Already absolute: prefixing the host would corrupt it.
            push @urls, $href;
        }
        elsif (defined $scheme && $href =~ m{^//})
        {
            push @urls, "$scheme$href";
        }
        else
        {
            push @urls, "$url_front$href";
        }
    }
    return @urls;
}


# Values returned by url_has_text().  They are numbers so that callers can
# order pages with <=>: a page without the text (0) compares lower than a
# page with it (100).
use constant YES_MARKER => 100;
use constant NO_MARKER  => 0;

{
    # Answers already computed, keyed by URL and then by the text searched
    # for, so that a page is not judged by the result of a different query.
    my %MarkerCache;

    # Return YES_MARKER if the page at $url contains $text as a literal
    # substring, NO_MARKER otherwise.  The page is downloaded with
    # get_page() (which croaks on HTTP failure) the first time a given
    # ($url, $text) pair is asked about; later calls are answered from
    # the cache.
    #
    # Side effect: when $url carries an "oldid=N" parameter, the downloaded
    # page is also saved to "oldid-N.html" in the current directory.
    sub url_has_text
    {
        my ($url, $text) = @_;
        unless (defined $MarkerCache{$url}{$text})
        {
            my $page = get_page($url);
            use File::Slurp;
            # Only dump the page when the revision id really matched;
            # otherwise $1 would be stale or undefined and the file name
            # would be wrong.
            if ($url =~ /oldid=(\d+)/)
            {
                write_file("oldid-$1.html", $page);
            }
            $MarkerCache{$url}{$text} = index($page, $text) >= 0
                                    ? YES_MARKER : NO_MARKER;
        }
        return $MarkerCache{$url}{$text};
    }
}

# This is from List::Search, which had a bug.  It can be
# removed when "List::Search::nlist_search(2, [2, 2, 2])" returns 0 and not 1
#
# Binary-search @$array_ref, which must be ordered consistently with
# $cmp_code, for the lowest index whose element is not less than $key.
#
#   $cmp_code  - called as $cmp_code->($key, $element); must return a
#                positive number when $key sorts after $element, and zero
#                or a negative number otherwise (like <=> or cmp)
#   $key       - the value being looked for
#   $array_ref - reference to the ordered array
#
# Returns the lowest index whose element makes $cmp_code return <= 0, or
# -1 when the array is empty or $key sorts after the last element.
sub custom_list_search
{
    my ($cmp_code, $key, $array_ref) = @_;

    # Nothing to search; also keeps undef away from the comparator.
    return -1 unless @$array_ref;

    # Perhaps there are no matches in the array.  Checking this first means
    # a hopeless search costs a single comparison.
    return -1 if $cmp_code->($key, $array_ref->[-1]) > 0;

    my $low  = 0;
    my $high = $#{$array_ref};

    while ($low <= $high)
    {
        my $mid = int($low + (($high - $low) / 2));

        if ($cmp_code->($key, $array_ref->[$mid]) > 0)
        {
            # Everything up to and including $mid is too small.
            $low = $mid + 1;
        }
        else
        {
            # $mid is a candidate; look for an earlier one.
            $high = $mid - 1;
        }
    }

    # Every index below $low compared too small and the last element did
    # not, so $low is the first acceptable index.
    return $low;
}

# Turn $article_url into the URL of its history listing (requesting
# $limit entries), fetch that listing with get_links(), and return the
# links that point at old revisions of an article.
#
# Two URL shapes are understood: one ending in "&oldid=N", and one
# containing "/wiki/Title".  Any other URL makes the sub die with that URL.
#
# The links are returned in the reverse of the order in which they appear
# on the listing, to make them chronological (this assumes the listing
# shows the newest revision first).
sub snarf_history_urls
{
    my ($article_url, $limit) = @_;

    my $listing_url = $article_url;
    unless ($listing_url =~ s/\&oldid=(\d+)$/\&action=history&limit=$limit/)
    {
        # Not a revision URL, so it has to be a plain /wiki/ article URL.
        $listing_url =~ s,/wiki/(.+),/w/index.php?title=$1&limit=$limit&action=history,
            or die $listing_url;
    }

    # Keep only links of the form index.php?title=...&oldid=N.
    my @revision_urls =
        grep { $_ =~ m,/index.php\?title=[^&]+&oldid=\d+$, }
        get_links($listing_url);

    my @chronological = reverse @revision_urls;
    return @chronological;
}

# Map $url onto the URL of a specific revision.
#
# A URL that already contains "&oldid=" followed by a digit is returned
# unchanged.  For any other URL a one-entry history listing is requested
# through snarf_history_urls() and its first entry is returned (undef when
# the listing produced no entries).
sub get_first_history_url
{
    my ($url) = @_;

    if ($url !~ /&oldid=\d/)
    {
        my ($first_entry) = snarf_history_urls($url, 1);
        return $first_entry;
    }

    return $url;
}

# Find the earliest revision, at or before a given one, whose page contains
# a piece of text.
#
#   $offending_history_url - URL of a revision known to contain the text;
#                            must carry an "&oldid=N" parameter
#   $offending_text        - the literal text to look for
#
# Returns the URL of the earliest revision found to contain the text.
# Returns nothing (after a warning where noted) when:
#   - the offending revision is not in the fetched history (warns);
#   - the offending revision itself does not contain the text;
#   - the oldest fetched revision already contains the text and the history
#     was cut off at the fetch limit, so the real origin lies further back
#     than we looked (warns).
# Dies with the URL if it has no "&oldid=N" parameter.
#
# The revisions are binary-searched, which assumes that once the text has
# appeared it is present in every later revision up to the offending one.
sub find_rev_that_added
{
    my ($offending_history_url, $offending_text) = @_;

    my ($offending_id) = $offending_history_url =~ /\&oldid=(\d+)/
        or die $offending_history_url;

    # FIXME: we limit it to 2000 urls to be kind to the wikiservers
    # We should really bite off pieces of history stepwise.
    my $max_urls = 2000;
    my @history_urls = snarf_history_urls($offending_history_url, $max_urls);

    # Keep the revisions up to and including the offending one.
    my $saw_offender;
    my @before_offender_urls;
    foreach my $url (@history_urls)
    {
        push @before_offender_urls, $url;
        if ($url =~ /\&oldid=$offending_id\b/)
        {
            $saw_offender = 1;
            last;
        }
    }

    # Without the offending revision as an end point we would be searching
    # revisions that come after it, and would report a wrong answer.
    unless ($saw_offender)
    {
        warn "Warning: I could not find revision $offending_id in the "
             . "article history I fetched (the most recent $max_urls "
             . "edits at most).\n";
        return;
    }

    # Comparator for custom_list_search(): orders the key against the
    # marker of the page behind $url.
    my $url2marker = sub {
        my ($key, $url) = @_;
        return $key <=> url_has_text($url, $offending_text);
    };
    my $first_with = custom_list_search($url2marker, YES_MARKER,
                                        \@before_offender_urls);
    return unless $first_with >= 0;

    # The oldest revision we fetched already has the text and the history
    # was truncated, so the edit that added it is older than anything we
    # looked at.
    if ($first_with == 0 && @history_urls >= $max_urls)
    {
        warn "Warning: It looks like that edit occurred further in "
             . "the past than I feel comfortable looking (beyond "
             . "$max_urls edits).\n";
        return;
    }

    return $before_offender_urls[$first_with];
}

# Main program: resolve the URL given on the command line to a specific
# revision, search backwards from it for the offending text, and report
# the outcome on standard output.
@ARGV == 2 or die "usage: $0 {article or history URL} {offending text}\n";
my $url = shift;
my $offending_text = shift;

my $offending_history_url = get_first_history_url($url);
defined $offending_history_url
    or die "Could not find any revision of $url\n";

if (my $found_url = find_rev_that_added($offending_history_url,
                                        $offending_text))
{
    # Compare revision ids when both URLs have one: the same revision can
    # be spelled with differently escaped titles, which a plain string
    # comparison of the URLs would treat as two different revisions.
    my ($found_id)     = $found_url             =~ /\&oldid=(\d+)/;
    my ($offending_id) = $offending_history_url =~ /\&oldid=(\d+)/;
    my $is_same_revision = (defined $found_id && defined $offending_id)
                         ? $found_id  eq $offending_id
                         : $found_url eq $offending_history_url;

    if ($is_same_revision)
    {
        print "No earlier revisions found.\n";
    }
    else
    {
        print "Earliest revision: $found_url\n";
    }
}
else
{
    print "Not found\n";
}