package HBCPlugins::RecoverTalkArchive;

# Bot plugin that walks the revision history of a wiki page, records every
# section heading that disappears between two consecutive revisions, and
# posts a report of those removals to another wiki page.

use strict;
use warnings;

use Algorithm::Diff qw(diff);
use HTTP::Request;
use HTTP::Request::Common;
use LWP::UserAgent;
use XML::Simple;
use URI::Escape;
use Data::Dumper;

# The plugin object.  The subs below are handed around as plain code refs
# (\&login, \&scan_talk_page, ...) and read this package variable instead of
# receiving the object as an argument.
our $self;

# Tag name used when escaping templates in headings.
# NOTE(review): presumably held in a variable so the literal tag never
# appears in this source text -- confirm before inlining it.
our $nowiki = ('nowiki');

# Constructor.  The first argument is ignored; the second is the hash ref
# that becomes the plugin object.  Registers the command handler, builds the
# HTTP client (gzip accepted, in-memory cookie jar) and queues login().
# Returns the blessed object.
sub new {
    my (undef, $plugin) = @_;

    $self = $plugin;
    bless $self, __PACKAGE__;

    register_method();

    $self->{ua} = LWP::UserAgent->new;
    $self->{ua}->default_headers->push_header('Accept-Encoding' => 'gzip');
    $self->{ua}->cookie_jar({});

    $self->{shared}{add_job}->(\&login, 0);

    return $self;
}

# Reads one credential file and returns its content without trailing line
# breaks.  Dies with the OS error if the file cannot be opened.
sub _read_credential {
    my ($path) = @_;

    open(my $fh, '<', $path)
        or die "Cannot read credential file '$path': $!\n";
    my $value = do { local $/; <$fh> };
    close($fh);

    $value = '' unless defined $value;

    # An editor-added newline would otherwise be sent as part of the value.
    $value =~ s/[\r\n]+\z//;

    return $value;
}

# Follows a chain of hash keys through a parsed XML structure.  Returns the
# value found, or undef as soon as a level is missing or is not a hash.
# Unlike a chained lookup it never autovivifies intermediate levels.
sub _dig {
    my ($node, @path) = @_;

    foreach my $key (@path) {
        return unless ref($node) eq 'HASH' && exists $node->{$key};
        $node = $node->{$key};
    }

    return $node;
}

# Logs in through the API using the credentials stored under
# adminCredentials/ and records the outcome in $self->{settings}{loggedIn}.
# Dies when a credential is missing or empty; warns and returns when the
# response is not parseable XML.
#
# NOTE(review): this uses a plain-HTTP endpoint and sends no login token --
# confirm the target wiki still accepts both.
sub login {
    my $admin_username = _read_credential('adminCredentials/username');
    my $admin_password = _read_credential('adminCredentials/password');

    die "Admin credentials are missing or empty (adminCredentials/username, adminCredentials/password)\n"
        unless (length($admin_username) && length($admin_password));

    print "Fetching account cookies...\n";

    my $response = $self->{ua}->request(
        POST(
            'http://en.wikipedia.org/w/api.php?action=login&format=xml',
            [ lgname => $admin_username, lgpassword => $admin_password ]
        )
    );

    my $xml    = $response->decoded_content();
    my $parsed = eval { XMLin($xml) };
    if ($@) {
        # Warn first, then bail out.  Written as "warn $@ && return" the
        # return is evaluated before warn and the message is never shown.
        warn $@;
        return;
    }

    my $result = _dig($parsed, 'login', 'result');
    $self->{settings}{loggedIn} = (defined($result) && $result eq 'Success');

    my $status = $self->{settings}{loggedIn} ? "Logged in.\n" : "Log in failed.\n";
    print $status;

    return;
}

# Looks up the command parser in the shared data (under the label given in
# the plugin parameters) and registers RecoverTalkArchive() as the handler
# for the 'RecoverTalkArchive' command.  Dies if no parser is found.
sub register_method {
    $self->{CommandParser} = $self->{shared}{ $self->{params}{command_parser_label} }
        || die "RecoverTalkArchive: command parser not found in shared data\n";

    $self->{CommandParser}->methodHandler('RecoverTalkArchive', \&RecoverTalkArchive);

    return;
}

# Command handler.
#   $rh_extras   - hash ref; its 'mw' entry is the client used to post the report
#   $ra_settings - array ref of [name, value] pairs (see parse_settings)
# Replaces $self->{settings} wholesale, so all scan state from an earlier
# run is dropped, then queues the first scan_talk_page() batch.
sub RecoverTalkArchive {
    my $rh_extras   = shift;
    my $ra_settings = shift;

    $self->{settings} = parse_settings($ra_settings);
    $self->{mw}       = ${$rh_extras}{'mw'};
    $self->{result}   = '';

    $self->{shared}{add_job}->(\&scan_talk_page, 0);

    return;
}

# Converts an array ref of [name, value] pairs into a hash ref.  When a name
# occurs more than once the last value wins.
sub parse_settings {
    my $ra_settings = shift;

    my %result;
    foreach my $ra_setting (@{$ra_settings}) {
        $result{ $ra_setting->[0] } = $ra_setting->[1];
    }

    return \%result;
}

# Fetches one batch of revisions of $self->{settings}{target}, oldest first,
# and compares each revision with the one before it.  Every heading line
# present in the earlier text and absent from the later one is appended to
# $self->{settings}{headings} as a hash with:
#   revid     - id of the earlier revision (the last one with the heading)
#   timestamp, comment, user - taken from the later revision
#   heading   - the heading text, escaped for the report
# The previous revision's text, id and timestamp are kept in the settings so
# the comparison carries across batch boundaries.  Queues itself again while
# the API reports a continuation; otherwise appends an end marker and queues
# make_report().  Dies when the request fails or the response is not XML.
sub scan_talk_page {
    my $batch_size = 500;

    my $target  = uri_escape($self->{settings}{target});
    my $rvstart = $self->{settings}{start_pos}
        ? "&rvstartid=$self->{settings}{start_pos}"
        : '';
    my $url = 'http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&rvdir=newer'
        . "&rvlimit=$batch_size"
        . '&rvprop=timestamp|user|comment|content|ids'
        . "&titles=$target$rvstart";

    print "Fetching $batch_size revisions for: $self->{settings}{target}\n";

    my $response = $self->{ua}->get($url);
    die "Failed to fetch revisions: " . $response->status_line . "\n"
        unless $response->is_success;

    my $xml    = $response->decoded_content();
    my $parsed = eval { XMLin($xml) };
    if ($@) {
        $xml = '' unless defined $xml;
        die "Failed to parse XML: $@\n$xml\n";
    }

    # XML::Simple yields a hash for a single <rev> and an array for several.
    # A missing <rev> must become an empty list: wrapping undef would create
    # a phantom empty revision whose diff reports every heading as removed.
    my $ra_revisions = _dig($parsed, qw(query pages page revisions rev));
    my @revisions =
          !defined($ra_revisions)       ? ()
        : ref($ra_revisions) eq 'ARRAY' ? @{$ra_revisions}
        :                                 ($ra_revisions);
    @revisions = grep { ref($_) eq 'HASH' } @revisions;

    print "Got " . scalar(@revisions) . " revisions...\n";
    $self->{settings}{totalRevisions} += scalar(@revisions);
    $self->{settings}{headings} ||= [];

    # Resume from the last revision of the previous batch.  Without this the
    # first diff of every continuation batch is recorded with no revid.
    my $prev_id        = $self->{settings}{lastRevid};
    my $last_timestamp = defined $self->{settings}{lastTimestamp}
        ? $self->{settings}{lastTimestamp}
        : '';

    foreach my $rh_revision (@revisions) {
        my $content = defined $rh_revision->{content} ? $rh_revision->{content} : '';
        my @lines   = split(/\n/, $content);

        if ($self->{settings}{lastpage}) {
            foreach my $ra_hunk (diff($self->{settings}{lastpage}, \@lines)) {
                foreach my $ra_diff (@{$ra_hunk}) {
                    # Each diff entry is [action, position, text].
                    my ($action, $text) = @{$ra_diff}[0, 2];
                    next unless $action eq '-';

                    # Non-greedy capture so the spaces before the closing
                    # "==" are not kept as part of the heading.
                    next unless $text =~ m|^==+\s*([^=]*?)\s*==+\s*$|;
                    my $heading = $1;

                    # Drop any existing tags, then wrap templates exactly
                    # once so they are not expanded in the report.
                    $heading =~ s|</?$nowiki>||g;
                    $heading =~ s|(\{\{.*\}\})|<$nowiki>$1</$nowiki>|;

                    # Prefix the first image:/file: with a colon.
                    # NOTE(review): presumably so the report links to the
                    # file instead of embedding it -- confirm.
                    $heading =~ s|image:|:Image:|i;
                    $heading =~ s|file:|:file:|i;

                    push(
                        @{ $self->{settings}{headings} },
                        {
                            revid     => $prev_id,
                            timestamp => $rh_revision->{timestamp},
                            comment   => $rh_revision->{comment},
                            user      => $rh_revision->{user},
                            heading   => $heading,
                        }
                    );
                }
            }
        }

        $self->{settings}{lastpage} = \@lines;
        $prev_id        = $rh_revision->{revid};
        $last_timestamp = defined $rh_revision->{timestamp} ? $rh_revision->{timestamp} : '';
    }

    $self->{settings}{lastRevid}     = $prev_id;
    $self->{settings}{lastTimestamp} = $last_timestamp;

    my $next_start = _dig($parsed, 'query-continue', 'revisions', 'rvstartid');
    if ($next_start) {
        $self->{settings}{start_pos} = $next_start;
        print "Next batch starts at: $self->{settings}{start_pos} ($last_timestamp)\n";
        $self->{shared}{add_job}->(\&scan_talk_page, 1);
    }
    else {
        # End marker consumed by make_report().
        push(
            @{ $self->{settings}{headings} },
            { lastRevid => $prev_id, lastTimestamp => $last_timestamp }
        );
        print "No more revisions left. Writing report.\n";
        $self->{shared}{add_job}->(\&make_report, 1);
    }

    print "\n";

    return;
}

# Builds the report text in $self->{result} from the collected headings and
# saves it to the page named by $self->{settings}{reportTo}.  A date heading
# is written whenever the day changes, a revision entry (archive link, diff
# link, user, summary) whenever the revision id changes, and one bullet per
# removed heading.  Stops at the end marker, which is written out as an HTML
# comment.  Dies with the client's error code and details if the edit fails.
sub make_report {
    my $last_date   = '';
    my $last_rev_id = '';
    my $title       = $self->{settings}{target};

    HEADING:
    foreach my $rh_heading (@{ $self->{settings}{headings} }) {
        # Test for the key, not its value, so the marker is still
        # recognised when no revision was ever seen.
        if (exists $rh_heading->{lastRevid}) {
            my ($marker_id, $marker_time) =
                map { defined $_ ? $_ : '' } @{$rh_heading}{qw(lastRevid lastTimestamp)};
            $self->{result} .= "<!-- lastRevid $marker_id lastTimestamp $marker_time -->\n";
            last HEADING;
        }

        my ($revid, $timestamp, $user, $heading, $summary) =
            map { defined $_ ? $_ : '' } @{$rh_heading}{qw(revid timestamp user heading comment)};

        # Capture in list context so a failed match yields '' rather than a
        # stale $1 left over from an earlier match.
        my ($printable_date) = $timestamp =~ m|^(\d\d\d\d-\d\d-\d\d)|;
        $printable_date = '' unless defined $printable_date;

        if ($printable_date ne $last_date) {
            $self->{result} .= "== $printable_date ==\n";
        }

        # Strip characters and sequences that would be interpreted as
        # markup inside the report.
        $summary =~ s|[{}]||g;
        $summary =~ s|\n||g;
        $summary ||= 'None';
        $summary =~ s|image:|:Image:|i;
        $summary =~ s|file:|:file:|i;
        $summary =~ s|__.*?__||g;

        if ($revid ne $last_rev_id) {
            $self->{result} .= ";'''[{{fullurl:$title|oldid=$revid}} Archive link for $timestamp]''' [{{fullurl:$title|diff=next&oldid=$revid}} (diff)]\n";
            $self->{result} .= ":* Removed by: ''[[User:$user]]''\n";
            $self->{result} .= ":* Summary: \"''$summary''\"\n";
            $self->{result} .= ":* Sections removed:\n";
        }

        $self->{result} .= "::* '''$heading'''\n";
        $self->{result} .= "\n";

        $last_rev_id = $revid;
        $last_date   = $printable_date;
    }

    my $report_title = $self->{settings}->{'reportTo'};
    print "Sending report to [[$report_title]]\n";

    my $page = $self->{mw}->get_page({ title => $report_title });
    $self->{mw}->edit(
        {
            action        => 'edit',
            title         => $report_title,
            basetimestamp => $page->{'timestamp'},
            text          => $self->{result},
            summary       => 'Posting talk page archive report.',
        }
    ) || die $self->{mw}->{error}->{code} . ': ' . $self->{mw}->{error}->{details};

    print "Done.\n\n";

    return;
}

1;