User:AnomieBOT/source/tasks/ReplaceExternalLinks5.pm
Approved 2011-12-04. Wikipedia:Bots/Requests for approval/AnomieBOT 60 |
package tasks::ReplaceExternalLinks5;
=pod
=begin metadata
Bot: AnomieBOT
Task: ReplaceExternalLinks5
BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT 60
Status: Approved 2011-12-04
Created: 2011-11-30
OnDemand: true
Add archiveurl for dead or dying links, when an archive can be found at
archive.org or webcitation.org, and optionally tag unarchived links with
{{tl|dead link}} or a similar template.
=end metadata
=cut
use utf8;
use strict;
use Data::Dumper;
use POSIX;
use Date::Parse;
use LWP::UserAgent;
use XML::LibXML;
use HTML::Entities ();
use URI;
use AnomieBOT::Task qw/:time/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;
my $req='[[User:AnomieBOT/req/Gamepro links|request]]';
# Useful character sets
my $chars='[^][<>"\x00-\x20\x7F\p{Zs}]';
my $dchars='[^][<>"\x00-\x20\x7F\p{Zs}/.]';
my $portre=qr!(?::\d+)?!;
# Template for marking dead links. Set undef for no tagging.
my $deadlink=undef; #'dead link';
# euquery values to search for
my @euquery=('*.gamepro.com');
# Regular expression matching links to replace. No protocol.
my $linkre=qr!(?:$dchars+\.)*(?i:gamepro\.com)$portre\/!;
# Description of links
my $desc='Gamepro';
###########################
# Marker to indicate where {{dead links}} should be removed
my $rmdl="\x02*\x03";
# Placeholder for when
my $nodl="\x02x\x03";
# The text part of a bracketed link
my $btext=qr/ *[^\]\x00-\x08\x0a-\x1F]*?/;
# Protocol re
my $proto1=qr!(?:https?:)?//!;
my $proto2=qr!https?://!;
sub new {
my $class=shift;
my $self=$class->SUPER::new();
$self->{'iter'}=undef;
$self->{'ua'}=LWP::UserAgent->new(
agent=>"AnomieBOT link checker for en.wikipedia.org (https://en.wikipedia.org/wiki/Wikipedia:Bots/Requests_for_approval/AnomieBOT_60)",
keep_alive=>300,
);
# Unfortunately, webcite seems to like quoting back the url without
# encoding ampersands in certain error messages.
$self->{'xml'}=XML::LibXML->new(recover=>1);
$self->{'protocols'}=[];
bless $self, $class;
return $self;
}
=pod
=for info
Approved 2011-12-04.<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 60]]
=cut
sub approved {
return -1;
}
sub run {
my ($self, $api)=@_;
my $res;
$api->task('ReplaceExternalLinks5', 0, 10, qw/d::Redirects d::Templates d::Nowiki/);
my $screwup='Errors? [[User:'.$api->user.'/shutoff/ReplaceExternalLinks5]]';
# Spend a max of 5 minutes on this task before restarting
my $endtime=time()+300;
# Get list of citation templates
my %templates=$api->redirects_to_resolved(
'Template:Citation',
'Template:Citation metadata',
'Template:Cite api',
'Template:Cite book',
'Template:Cite conference',
'Template:Cite IETF',
'Template:Cite interview',
'Template:Cite journal',
'Template:Cite mailing list',
'Template:Cite news',
'Template:Cite press release',
'Template:Cite video',
'Template:Cite web',
'Template:Unicite',
'Template:Vancite conference',
'Template:Vancite journal',
'Template:Vancite news',
'Template:Vancite web',
'Template:Vcite conference',
'Template:Vcite journal',
'Template:Vcite news',
'Template:Vcite web',
);
if(exists($templates{''})){
$api->warn("Failed to get citation template redirects: ".$templates{''}{'error'}."\n");
return 60;
}
# Get regex for finding {{dead link}}
my (%dl,$dlre);
if(defined($deadlink)){
%dl=$api->redirects_to_resolved($deadlink);
if(exists($dl{''})){
$api->warn("Failed to get dead link template redirects: ".$dl{''}{'error'}."\n");
return 60;
}
$dlre='{{(?i:\s*Template\s*:)?\s*(?:'.join('|',map { $_="\Q$_\E"; s/^Template\\:(.)/(?i:$1)/; s/\\ /[ _]/g; $_; } keys %dl).')(?>\s*(?s:\|.*?)?}})';
$dlre=qr/$dlre/;
} else {
%dl=();
$dlre=qr/(*F)x/;
}
$self->{'protocols'}=[qw/http https/] unless @{$self->{'protocols'}};
while(@{$self->{'protocols'}}){
if(!defined($self->{'iter'})){
$self->{'iter'}=$api->iterator(
list => 'exturlusage',
euprop => 'title',
euquery => [@euquery],
euprotocol => shift @{$self->{'protocols'}},
eunamespace => '0',
eulimit => '1000', # exturlusage has issues with big lists
);
}
while(my $pg=$self->{'iter'}->next){
if(!$pg->{'_ok_'}){
$api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");
return 60;
}
return 0 if $api->halting;
my $page=$pg->{'title'};
my $tok=$api->edittoken($page, EditRedir => 1);
if($tok->{'code'} eq 'shutoff'){
$api->warn("Task disabled: ".$tok->{'content'}."\n");
return 300;
}
if($tok->{'code'} ne 'success'){
$api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");
next;
}
if(exists($tok->{'missing'})){
$api->warn("WTF? $page does not exist?\n");
next;
}
# Setup flags
$self->{'flags'}={cite=>0,link=>0,404=>0,fail=>0};
my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
my $outtxt=$intxt;
# Replace the links. First, do citation templates.
my $nowiki;
$outtxt=$api->process_templates($outtxt, sub {
return undef if $self->{'flags'}{'fail'};
my $name=shift;
my $params=shift;
my $wikitext=shift;
my $data=shift;
my $oname=shift;
return undef unless exists($templates{"Template:$name"});
my $ret="{{$oname";
my $archived=0;
my $url='';
my ($accessdate,$date,$year,$month);
$year=$month='XXX';
foreach ($api->process_paramlist(@$params)){
$_->{'name'}=~s/^\s+|\s+$//g;
$_->{'value'}=~s/^\s+|\s+$//g;
if($_->{'name'} eq 'url'){
$url=$_->{'value'};
} elsif($_->{'name'} eq 'accessdate'){
$accessdate=str2time($_->{'value'});
} elsif($_->{'name'} eq 'date'){
$date=str2time($_->{'value'});
} elsif($_->{'name'} eq 'year' && $_->{'value'}=~/^\d+$/){
$year=$_->{'value'};
} elsif($_->{'name'} eq 'month'){
$month=$_->{'value'};
} elsif($_->{'name'} eq 'archiveurl'){
$archived=1;
}
$ret.='|'.$_->{'text'};
}
my $r404='';
if(!$archived && $url=~m!^$proto1$linkre!){
my ($u,$dt);
$dt=$accessdate // $date // str2time("1 $month $year") // str2time("1 June $year") // time();
($u,$dt,$r404)=chkExtLink($self,$api,0,$url, $dt);
return undef if($self->{'flags'}{'fail'});
$ret.="|archiveurl=$u|archivedate=$dt" unless $r404;
if(!$r404){
$ret=~s/$rmdl//g;
$r404=$rmdl;
}
}
$ret.="}}".$r404;
return $ret;
});
return 60 if($self->{'flags'}{'fail'});
# Next, strip for raw link processing
# Regular expressions are adapted from those MediaWiki uses to
# recognize external links.
($outtxt,$nowiki)=$api->strip_nowiki($outtxt);
($outtxt,$nowiki)=$api->strip_templates($outtxt, sub {
my $name=shift;
return exists($templates{"Template:$name"});
}, {}, $nowiki);
# Strip out ref tags, then replace any links with a guess at access
# date.
($outtxt,$nowiki)=$api->strip_regex(qr!<ref[ >].*?</ref>!, $outtxt, $nowiki);
my @arc=qw/[aA]rchive webcitation\.org [wW]ayback/;
my $arc='(?:'.join('|',@arc).')';
while(my ($k,$v)=each %$nowiki){
next unless $v=~/^<ref/;
next if $v=~/$arc/;
my ($dt,$nw);
# We have to re-strip here, because the saved values here are
# automatically unstripped.
($v,$nw)=$api->strip_nowiki($v);
($v,$nw)=$api->strip_templates($v, sub {
my $name=shift;
return exists($templates{"Template:$name"});
}, {}, $nw);
$dt=str2time($1) if $v=~/(?:accessed|retrieved)(?: +on)? +(\d{4}-\d{2}-\d{2}|\d+ \w+,? \d{4}|\w+ \d+,? \d{4})/i;
$v=~s{\[($proto1$linkre$chars*)($btext)\]}{ chkExtLink($self,$api,1,$1,$dt // time(),$2) }ge;
return 60 if($self->{'flags'}{'fail'});
($v,$nw)=$api->strip_regex(qr{\[$proto1$chars+$btext\]}, $v, $nw);
$v=~s{\b($proto2$linkre$chars*)}{ chkExtLink($self,$api,2,$1,$dt // time()) }ge;
return 60 if($self->{'flags'}{'fail'});
$v=$api->replace_stripped($v,$nw);
$nowiki->{$k}=$v;
}
# Fix any bracketed external link that doesn't have "Archive" or the
# like in the line after it.
$outtxt=~s{\[($proto1$linkre$chars*)($btext)\](?!.*$arc)}{ chkExtLink($self,$api,1,$1,time(),$2) }ge;
return 60 if($self->{'flags'}{'fail'});
# Hide all bracketed external links. We have to keep track of the
# replacement token for the ones that have "Archive" etc in their
# display text.
($outtxt,$nowiki)=$api->strip_regex(qr{\[$proto1$chars+$btext\]}, $outtxt, $nowiki);
while(my ($k,$v)=each %$nowiki){
push @arc, $k if $v=~m!^\[$proto1$chars+ *.*$arc!;
}
$arc='(?:'.join('|',@arc).')';
# Fix any bare external link that doesn't have "Archive" or the like in
# the line after it.
$outtxt=~s{\b($proto2$linkre$chars+)(?!.*$arc)}{ chkExtLink($self,$api,2,$1,time()) }ge;
return 60 if($self->{'flags'}{'fail'});
# Unstrip
$outtxt=$api->replace_stripped($outtxt,$nowiki);
# Remove "no-dead-link" markers
$outtxt=~s/\Q$nodl\E//g;
# rm marked {{dead link}} templates (and $rmdl markers)
$outtxt=~s/\Q$rmdl\E(?:\s*$dlre)*//g;
# rm duplicate {{dead link}} templates too
$outtxt=~s/$dlre+($dlre)/$1/g;
if($outtxt ne $intxt){
my @summary=();
push @summary, "adding archiveurl for archived $desc cites" if $self->{'flags'}{'cite'};
push @summary, "changing archived $desc links" if $self->{'flags'}{'link'};
push @summary, "tagging dead $desc links" if($self->{'flags'}{'404'} && defined($deadlink));
unless(@summary){
$api->warn("Changes made with no summary for $page, not editing");
next;
}
$summary[$#summary]='and '.$summary[$#summary] if @summary>1;
my $summary=ucfirst(join((@summary>2)?', ':' ', @summary));
$summary.=" per $req";
$api->log("$summary in $page");
my $r=$api->edit($tok, $outtxt, "$summary. $screwup", 1, 1);
if($r->{'code'} ne 'success'){
$api->warn("Write failed on $page: ".$r->{'error'}."\n");
next;
}
}
# If we've been at it long enough, let another task have a go.
return 0 if time()>=$endtime;
}
$self->{'iter'}=undef;
}
$api->log("May be DONE!");
return 600;
}
sub chkExtLink {
my $self=shift;
if($self->{'flags'}{'fail'}){
return wantarray?('fail','fail','fail'):'fail';
}
my $api=shift;
my $fmt=shift;
my $url=shift;
my $date=shift;
my $txt='';
if($fmt==2){
# Duplicate Mediawiki post-processing of bare external links
$txt=$1.$txt if $url=~s/((?:[<>]|&[lg]t;).*$)//;
my $sep=',;\.:!?';
$sep.=')' unless $url=~/\(/;
$txt=$1.$txt if $url=~s/([$sep]+$)//;
# There shouldn't be a template inside the url
$txt=$1.$txt if $url=~s/(\{\{.*$)//;
return $url.$txt unless $url=~m!^$proto2$linkre!;
}
# Get archive link and date
my @archives;
my ($u, $dt);
if(exists($api->store->{$url})){
@archives=@{$api->store->{$url}};
} else {
($u="http://web.archive.org/web/*/$url")=~s!/$proto1!/!;
$api->log("... Checking $u");
# Screen-scrape archive.org
my $r=$self->{'ua'}->get($u);
if($r->is_success){
foreach $_ ($r->decoded_content=~m!href="(http://web.archive.org/web/\d+/[^\x22]*)"!g) {
$_ = HTML::Entities::decode($_);
$api->log("... Got $_");
if(m!^http://web.archive.org/web/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})!){
$dt=timegm($6,$5,$4,$3,$2-1,$1-1900);
} else {
$dt=time();
}
push @archives, [$dt, $_];
}
} elsif($r->code eq '404'){
$api->log("... Failed with ".$r->code);
} elsif($r->code eq '403' && $r->decoded_content=~m!<p class="mainTitle">Blocked Site Error.<br><br>\s*</p>\s*<p class="mainBigBody"><i>\Q$url\E</i> is not available in the Wayback Machine!){
$api->log("... Failed with 403 'not available in the Wayback Machine'");
} else {
$api->log("... Failed with ".$r->code.", will retry later");
$self->{'flags'}{'fail'}=1;
return chkExtLink($self);
}
# check webcite too
$u=URI->new('http://www.webcitation.org/query');
$u->query_form(url=>$url,returnxml=>1);
$u=$u->as_string;
$api->log("... Checking $u");
$r=$self->{'ua'}->get($u);
if($r->is_success){
my $xml=$self->{'xml'}->load_xml(string=>$r->decoded_content);
if($xml){
foreach $_ (@{$xml->find('//result[@status=\'success\']')}){
$dt=$_->find('./timestamp');
my $uu=URI->new('http://www.webcitation.org/query');
$uu->query_form(url=>$url,date=>$dt);
$uu=$uu->as_string;
# Not exactly RFC-compliant, but it works fine
$uu=~s/%3A/:/g; $uu=~s/%2F/\//g;
$api->log("... Got $uu");
push @archives, [str2time($dt) // time(), $uu];
}
} else {
$api->log("... Invalid XML data");
$self->{'flags'}{'fail'}=1;
return chkExtLink($self);
}
} elsif($r->code eq '404'){
$api->log("... Failed with ".$r->code);
} else {
$api->log("... Failed with ".$r->code.", will retry later");
$self->{'flags'}{'fail'}=1;
return chkExtLink($self);
}
$api->store->{$url}=\@archives;
}
# Then pull the closest archive to the accessdate or whatever.
my ($diff,$r404)=(1e100,defined($deadlink)?"{{$deadlink|date=".strftime('%B %Y', gmtime)."|bot=".$api->user."}}":$nodl);
$u=undef;
foreach $_ (@archives){
if(abs($_->[0] - $date) < $diff){
$diff=abs($_->[0] - $date);
($dt,$u)=@$_;
$r404='';
}
}
if($r404){
$self->{'flags'}{'404'}=1;
} elsif($fmt==0){
$self->{'flags'}{'cite'}=1;
} else {
$self->{'flags'}{'link'}=1;
}
if($fmt==0){ # cite template
return ($u,strftime('%Y-%m-%d',gmtime($dt // 0)),$r404);
} elsif($fmt==1){ # Bracketed external link
my $txt=shift;
return $r404?"[$url$txt]$r404":"[$u$txt]$rmdl";
} elsif($fmt==2){ # Bare external link
return ($r404?"[$url $url]$r404":"$u$rmdl").$txt.$rmdl;
} else {
return undef;
}
}
1;