Wikiblame
Here's a hack to find who put some nefarious text into an article. Please let me know if you have any comments or suggestions.
#!/usr/bin/perl -w
# FILE: wiki-blame.pl
# AUTHOR: TotoBaggins on en.wikipedia.org
#
# LICENSE: GPL
#
# DESCRIPTION: This program prints the URL of the first revision of an article
# to contain a particular bit of text. It binary-searches the revision
# history, so it downloads only about log_2(N) revisions; that keeps it
# reasonably fast and kind to the servers.
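# (For an article with, say, 1500 revisions, a naive scan would fetch all
# 1500 pages, while the binary search typically fetches only a dozen or so.)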
#
# USAGE:
#
# ./wiki-blame.pl {article or history URL} {offending text}
#
# The URL must be url-encoded.
# The offending text should be quoted as a single argument.
#
# EXAMPLES:
#
# Find which revision inserted some text at or before an old revision
# of the C++ article:
#
# ./wiki-blame.pl 'http://en.wikipedia.org/w/index.php?title=C%2B%2B&oldid=101608911' 'Evil Text'
#
#
# Find which revision inserted some text at or before the current revision
# of the C++ article:
#
# ./wiki-blame.pl 'http://en.wikipedia.org/wiki/C%2B%2B' 'Evil Text'
#
#
# BUGS:
#
# -- We only look back 2000 edits worth.
# -- We could be politer and faster if we stepped back through
# history exponentially.
# -- We are too dependent on wikipedia.org's URL layout.
#
use strict;
use LWP::UserAgent;
use HTML::Parser;
use HTML::LinkExtor;
use File::Slurp;
use Carp;
use Data::Dumper;
my $WebAgent = LWP::UserAgent->new(keep_alive => 1);
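# Fetch a URL and return the page body; dies with the HTTP status line on
# failure.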
sub get_page
{
    my $url = shift;
    my $response = $WebAgent->get($url);
    $response->is_success() or croak $response->status_line();
    return $response->content();
}
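# Fetch a page and return the targets of all of its links as absolute URLs.
# (The hrefs on wikipedia.org are site-relative, so we glue the scheme and
# host back on.)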
sub get_links
{
    my $url = shift;

    # scheme://host, for turning site-relative hrefs into absolute URLs
    my $url_front = $url;
    $url_front =~ s,^([^/]+//[^/]+)/.*,$1,;

    my $page = get_page($url);
    my $linky = HTML::LinkExtor->new();
    $linky->utf8_mode(1);
    $linky->parse($page) or croak "Can't parse: $page";

    my @urls;
    foreach my $link ($linky->links())
    {
        my ($tag, %links) = @$link;
        my $url = $links{href} or next;
        push @urls, "$url_front$url";
    }
    return @urls;
}
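# A revision gets marker YES_MARKER if it contains the offending text and
# NO_MARKER if it doesn't. The markers are numeric so the <=> comparator used
# by the binary search below can order them.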
use constant YES_MARKER => 100;
use constant NO_MARKER  => 0;

{
    # Cache of URL => marker, so each revision is fetched at most once.
    my %MarkerCache;

    sub url_has_text
    {
        my ($url, $text) = @_;
        unless (defined $MarkerCache{$url})
        {
            my $page = get_page($url);

            # Keep a copy of each fetched revision on disk for debugging.
            if ($url =~ /oldid=(\d+)/)
            {
                write_file("oldid-$1.html", $page);
            }

            $MarkerCache{$url} = index($page, $text) >= 0
                                     ? YES_MARKER : NO_MARKER;
        }
        return $MarkerCache{$url};
    }
}
# This is from List::Search, which had a bug. It can be
# removed when "List::Search::nlist_search(2, [2, 2, 2])" returns 0 and not 1
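#
# Given an array that is in ascending order under $cmp_code (a <=>-style
# comparator), return the index of the first element that compares equal to
# (or greater than) $key, or -1 if even the last element is smaller than $key.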
sub custom_list_search
{
    my ($cmp_code, $key, $array_ref) = @_;

    my $max_index = scalar(@$array_ref) - 1;
    my $low  = 0;
    my $mid  = undef;
    my $high = $max_index;
    my $lowest_match = undef;

    while ($low <= $high)
    {
        $mid = int($low + (($high - $low) / 2));
        my $mid_val = $array_ref->[$mid];
        my $cmp_result = $cmp_code->($key, $mid_val);
        if ($cmp_result > 0)
        {
            $low = $mid + 1;
        }
        else
        {
            if ($cmp_result == 0
                && (!defined($lowest_match) || $lowest_match > $mid))
            {
                $lowest_match = $mid;
            }
            $high = $mid - 1;
        }
    }

    # Look at the values here and work out what to return.
    # Perhaps there are no matches in the array
    return -1 if $cmp_code->($key, $array_ref->[-1]) == 1;

    # Perhaps $mid is just before the best match
    return $mid + 1 if $cmp_code->($key, $array_ref->[$mid]) == 1;

    # $mid is correct
    return $mid;
}
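# Scrape the article's history page and return up to $limit revision
# ("oldid") URLs, oldest first.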
sub snarf_history_urls
{
    my ($article_url, $limit) = @_;

    my $idx_url = $article_url;
    $idx_url =~ s/\&oldid=(\d+)$/\&action=history&limit=$limit/
        ||
    $idx_url =~ s,/wiki/(.+),/w/index.php?title=$1&limit=$limit&action=history,
        or die $idx_url;

    my @all_urls = get_links($idx_url);
    my @history_urls;
    foreach my $url (@all_urls)
    {
        # only old article urls
        next unless $url =~ m,/index.php\?title=[^&]+&oldid=\d+$,;
        push @history_urls, $url;
    }

    # make chronological
    @history_urls = reverse @history_urls;
    return @history_urls;
}
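# Turn an article URL into a specific-revision URL: a URL that already has an
# oldid is returned as-is; otherwise we take the newest entry from the
# article's history.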
sub get_first_history_url
{
    my $url = shift;
    return $url if $url =~ /&oldid=\d/;
    my @history_urls = snarf_history_urls($url, 1);
    return $history_urls[0];
}
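# The heart of the script: collect the revision URLs up to and including the
# offending revision, then binary-search them for the earliest one containing
# $offending_text. The comparator fetches only the revisions the search
# actually probes, which is how we stay near log_2(N) downloads.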
sub find_rev_that_added
{
    my ($offending_history_url, $offending_text) = @_;

    # FIXME: we limit it to 2000 urls to be kind to the wikiservers.
    # We should really bite off pieces of history stepwise.
    my $max_urls = 2000;
    my @history_urls = snarf_history_urls($offending_history_url, $max_urls);

    $offending_history_url =~ /\&oldid=(\d+)/ or die $offending_history_url;
    my $offending_id = $1;

    # Keep only the revisions up to and including the offending one.
    my @before_offender_urls;
    foreach my $url (@history_urls)
    {
        push @before_offender_urls, $url;
        last if $url =~ /\&oldid=$offending_id\b/;
    }

    # Comparator for the binary search: fetch the revision (lazily, via the
    # cache in url_has_text) and compare its marker against YES_MARKER.
    my $url2marker = sub {
        my ($key, $url) = @_;
        my $has_it = url_has_text($url, $offending_text);
        return $key <=> $has_it;
    };

    my $first_with = custom_list_search($url2marker, YES_MARKER,
                                        \@before_offender_urls);
    return unless $first_with >= 0;

    # If the oldest revision we fetched already contains the text and our
    # history window was full, the real insertion may be older still.
    if ($first_with == 0 && @history_urls == $max_urls)
    {
        warn "Warning: It looks like that edit occurred further in "
           . "the past than I feel comfortable looking (beyond "
           . "$max_urls edits).\n";
        return;
    }
    return $before_offender_urls[$first_with];
}
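# Main program: resolve the starting URL to a revision URL, then hunt for the
# revision that introduced the text.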
@ARGV == 2 or die "usage: $0 {article or history URL} {offending text}\n";
my $url = shift;
my $offending_text = shift;

my $offending_history_url = get_first_history_url($url);
if (my $found_url = find_rev_that_added($offending_history_url,
                                        $offending_text))
{
    if ($found_url eq $offending_history_url)
    {
        print "No earlier revisions found.\n";
    }
    else
    {
        print "Earliest revision: $found_url\n";
    }
}
else
{
    print "Not found\n";
}