User:HighInBC/MCP/RecoverTalkArchive

package HBCPlugins::RecoverTalkArchive;
use     strict;
use     Algorithm::Diff qw(diff);
use     HTTP::Request;
use     HTTP::Request::Common;
use     LWP::UserAgent;
use     XML::Simple;
use     URI::Escape;
use     Data::Dumper;
# Plugin instance shared by every sub in this package; assigned in new().
our     $self;
# Tag name interpolated into the heading clean-up patterns in scan_talk_page.
# NOTE(review): presumably kept in a variable so the literal tag never appears
# in this source text (which is hosted on a wiki page) -- confirm.
our     $nowiki = ('nowiki');

# Constructor.
#
# Arguments:
#   first  - discarded (NOTE(review): presumably the class name -- confirm
#            against the caller); the object is always blessed into this
#            package.
#   second - hash reference that is blessed in place and stored in the
#            package-level $self read by every other sub in this file.
#
# Side effects: registers the RecoverTalkArchive command handler, creates the
# LWP::UserAgent kept in $self->{ua} (gzip accepted, cookie jar enabled) and
# passes \&login to the shared add_job callback.
#
# Returns the blessed $self.
sub new {
  shift;
  $self = shift;
  bless($self, __PACKAGE__);

  register_method();

  # Direct method call instead of "new LWP::UserAgent": this package defines
  # its own sub new, so the indirect form would leave the parser guessing.
  my $ua = LWP::UserAgent->new;
  $ua->default_headers->push_header('Accept-Encoding' => 'gzip');
  $ua->cookie_jar({});
  $self->{ua} = $ua;

  # The meaning of the second argument is defined by add_job (not visible here).
  $self->{shared}{add_job}->(\&login, 0);

  return $self;
}

# Logs the shared user agent into en.wikipedia.org.
#
# Reads the account name and password from the files
# adminCredentials/username and adminCredentials/password, posts them to
# api.php?action=login and stores in $self->{settings}{loggedIn} whether the
# API answered 'Success'.
#
# Dies if a credential file cannot be opened or if either credential is
# empty.  If the response cannot be parsed as XML the parser error is
# reported with warn and the sub returns without touching $self->{settings}.
sub login {
  my ($admin_username, $admin_password);
  foreach my $ra_credential (
    ['adminCredentials/username', \$admin_username],
    ['adminCredentials/password', \$admin_password],
  ) {
    my ($path, $sr_value) = @{$ra_credential};
    open(my $fh, '<', $path) or die "Cannot open $path: $!\n";
    # Read the whole file in one call; the bytes are used exactly as stored.
    sysread($fh, ${$sr_value}, -s $fh);
    close($fh);
  }
  die "Admin username or password is missing or empty\n"
    unless ($admin_username && $admin_password);

  print "Fetching account cookies...\n";
  my $response = $self->{ua}->request(
    POST('http://en.wikipedia.org/w/api.php?action=login&format=xml',
         [lgname => $admin_username, lgpassword => $admin_password])
  );
  my $xml = $response->decoded_content();
  eval { $xml = XMLin($xml) };
  if ($@) {
    # Two statements on purpose: "warn $@ && return" hands "$@ && return" to
    # warn as its argument, so the return would run before anything is printed.
    warn $@;
    return;
  }
  $self->{settings}{loggedIn} = (${$xml}{login}{result} eq 'Success');
  print(($self->{settings}{loggedIn}) ? ("Logged in.\n") : ("Log in failed.\n"));
}

# Looks up the command parser in $self->{shared} under the label given by
# $self->{params}{command_parser_label}, stores it in $self->{CommandParser}
# and registers RecoverTalkArchive() as the handler for the
# 'RecoverTalkArchive' method.  Dies if no parser is found under that label.
sub register_method {
  my $parser_label = $self->{params}{command_parser_label};
  my $parser = $self->{shared}{$parser_label} or die;

  $self->{CommandParser} = $parser;
  $parser->methodHandler('RecoverTalkArchive', \&RecoverTalkArchive);
}

# Handler registered with the command parser under 'RecoverTalkArchive'.
#
# Arguments:
#   $rh_extras   - hash reference; only its 'mw' entry is used and is kept
#                  in $self->{mw}.
#   $ra_settings - array reference of [name, value] pairs, converted to a
#                  hash by parse_settings().
#
# Replaces $self->{settings} wholesale, empties $self->{result} and passes
# \&scan_talk_page to the shared add_job callback.
sub RecoverTalkArchive {
  my ($rh_extras, $ra_settings) = @_;

  $self->{settings} = parse_settings($ra_settings);
  $self->{mw}       = $rh_extras->{'mw'};
  $self->{result}   = '';

  $self->{shared}{add_job}->(\&scan_talk_page, 0);
}

# Converts a list of [name, value] pairs into a hash.
#
# Argument: array reference whose elements are array references; element 0
#           of each is used as the key and element 1 as the value.
# Returns:  reference to a new hash.  When a name occurs more than once the
#           last pair wins.
sub parse_settings {
  my ($ra_pairs) = @_;

  my %setting_for;
  for my $ra_pair (@{$ra_pairs}) {
    $setting_for{ $ra_pair->[0] } = $ra_pair->[1];
  }

  return \%setting_for;
}

# Job: fetch the next batch of revisions of the target page (oldest first,
# per rvdir=newer in the request) and record every section heading that
# disappears between one revision and the next.
#
# Reads from $self->{settings}:
#   target    - title of the page to scan
#   start_pos - revision id to resume from (unset for the first batch)
#   lastpage  - array ref holding the lines of the last revision seen so far
#   prev_id   - revision id of that last revision
# Updates those entries plus totalRevisions and headings, an array ref of
# { revid, timestamp, comment, user, heading } hashes where revid is the
# revision *before* the one that removed the heading.
#
# If the response carries a query-continue marker the sub passes itself to
# add_job again; otherwise it appends a { lastRevid, lastTimestamp } marker
# to headings and passes make_report to add_job.
#
# Dies if the API response cannot be parsed as XML.
sub scan_talk_page {
  my $batch_size = 500;
  my $url = 'http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&rvdir=newer&rvlimit=<BATCHSIZE>&rvprop=timestamp|user|comment|content|ids&titles=<TITLE><RVSTART>';
  my $target = uri_escape($self->{settings}{target});
  my $rvstart = (($self->{settings}{start_pos}) ? ("&rvstartid=$self->{settings}{start_pos}"): (''));
  $url =~ s|<RVSTART>|$rvstart|;
  $url =~ s|<TITLE>|$target|;
  $url =~ s|<BATCHSIZE>|$batch_size|;
  print "Fetching $batch_size revisions for: $self->{settings}{target}\n";
  my $xml = $self->{ua}->get($url)->decoded_content();
  eval {$xml = XMLin($xml);};
  if ($@) {
    die "Failed to parse XML: $@\n$xml\n";
  }

  # A single revision arrives as a bare hash, none at all as undef; normalise
  # both to an array ref so an empty response is not counted as a revision.
  my $ra_revisions = ${$xml}{'query'}{'pages'}{'page'}{'revisions'}{'rev'};
  if (!defined($ra_revisions)) {
    $ra_revisions = [];
  }
  elsif (ref($ra_revisions) ne 'ARRAY') {
    $ra_revisions = [$ra_revisions];
  }
  print "Got ".scalar(@{$ra_revisions})." revisions...\n";
  $self->{settings}{totalRevisions} += scalar(@{$ra_revisions});
  $self->{settings}{headings} ||= [];

  # Carried over from the previous batch so that a removal made by the first
  # revision of this batch is still attributed to a real revision id.
  my $prev_id = $self->{settings}{prev_id};
  my $last_timestamp;
  foreach my $rh_revision (@{$ra_revisions}) {
    my @lines = split("\n", ${$rh_revision}{'content'});
    if ($self->{settings}{lastpage}) {
      my @diffs = diff($self->{settings}{lastpage}, \@lines);
      foreach my $ra_hunk (@diffs) {
        foreach my $ra_diff (@{$ra_hunk}) {
          my ($action, $content) = @{$ra_diff}[0,2];
          # Only removed lines that look like a section heading are of interest.
          next unless ($action eq '-');
          next unless ($content =~ m|^==+\s*([^=]*?)\s*==+\s*$|);
          my $heading = $1;

          # Drop existing nowiki tags, then wrap template calls exactly once
          # so they are shown rather than expanded in the report.
          $heading =~ s|</?$nowiki>||g;
          $heading =~ s|(\{\{.*\}\})|<$nowiki>$1</$nowiki>|;
          # Leading colon turns image/file references into plain links.
          $heading =~ s|image:|:Image:|i;
          $heading =~ s|file:|:file:|i;

          my %result = (
            revid     => $prev_id,
            timestamp => ${$rh_revision}{'timestamp'},
            comment   => ${$rh_revision}{'comment'},
            user      => ${$rh_revision}{'user'},
            heading   => $heading,
          );
          push(@{$self->{settings}{headings}}, \%result);
        }
      }
    }
    # @lines is a fresh array on every iteration, so keeping a reference is safe.
    $self->{settings}{lastpage} = \@lines;
    $prev_id = ${$rh_revision}{'revid'};
    $last_timestamp = ${$rh_revision}{'timestamp'};
  }
  $self->{settings}{prev_id} = $prev_id;

  if (${$xml}{'query-continue'}{'revisions'}{'rvstartid'}) {
    $self->{settings}{start_pos} = ${$xml}{'query-continue'}{'revisions'}{'rvstartid'};
    print "Next batch starts at: $self->{settings}{start_pos} ($last_timestamp)\n";
    $self->{shared}{add_job}->(\&scan_talk_page, 1);
  } else {
    push(@{$self->{settings}{headings}}, {lastRevid => $prev_id, lastTimestamp => $last_timestamp});
    print "No more revisions left. Writing report.\n";
    $self->{shared}{add_job}->(\&make_report, 1);
  }
  print "\n";
}

# Job: turn the headings collected in $self->{settings}{headings} into
# wikitext and save it to the page named by $self->{settings}{reportTo}.
#
# Output layout, appended to $self->{result}:
#   - a "== YYYY-MM-DD ==" section whenever the date of an entry differs
#     from the date of the previous entry;
#   - for each new revision id, an archive link, a diff link, the user who
#     removed the sections and the edit summary;
#   - one bullet per removed heading;
#   - a trailing HTML comment with the last revision id and timestamp, taken
#     from the end marker that closes the headings list.
#
# Dies with the MediaWiki error code and details if the edit fails.
sub make_report {
  my $last_timestamp;
  my $last_rev_id;

  my $title = $self->{settings}{target};

  HEADING: foreach my $rh_heading (@{$self->{settings}{headings}}) {
    # The end marker is recognised by its key, not by the value being true,
    # so a marker with an undefined id is not mistaken for a heading entry.
    if (exists ${$rh_heading}{lastRevid}) {
      $self->{result} .= "<!-- lastRevid ${$rh_heading}{lastRevid} lastTimestamp ${$rh_heading}{lastTimestamp} -->\n";
      last HEADING;
    }

    # List-context match: undef when the timestamp has no leading date,
    # rather than whatever an earlier match left in $1.
    my ($printable_date) = ${$rh_heading}{timestamp} =~ m|^(\d\d\d\d-\d\d-\d\d)|;
    if ($printable_date ne $last_timestamp) {
      $self->{result} .= "== $printable_date ==\n";
    }

    # Make the edit summary safe to embed: no braces, newlines or
    # double-underscore magic words, and image/file references as links.
    my $summary = ${$rh_heading}{comment};
    $summary =~ s|[{}]||g;
    $summary =~ s|\n||g;
    $summary ||= 'None';
    $summary =~ s|image:|:Image:|i;
    $summary =~ s|file:|:file:|i;
    $summary =~ s|__.*?__||g;

    if (${$rh_heading}{revid} != $last_rev_id) {
      $self->{result} .= ";'''[{{fullurl:$title|oldid=${$rh_heading}{revid}}} Archive link for ${$rh_heading}{timestamp}]''' [{{fullurl:$title|diff=next&oldid=${$rh_heading}{revid}}} (diff)]\n";
      $self->{result} .= ":* Removed by: ''[[User:${$rh_heading}{user}]]''\n";
      $self->{result} .= ":* Summary: \"''$summary''\"\n";
      $self->{result} .= ":* Sections removed:\n";
    }
    $self->{result} .= "::* '''${$rh_heading}{heading}'''\n";
    $self->{result} .= "\n";
    $last_rev_id = ${$rh_heading}{revid};
    $last_timestamp = $printable_date;
  }

  print "Sending report to [[$self->{settings}->{'reportTo'}]]\n";
  my $page = $self->{mw}->get_page({title => $self->{settings}->{'reportTo'}});

  $self->{mw}->edit( {
    action        => 'edit',
    title         => $self->{settings}->{'reportTo'},
    basetimestamp => $page->{'timestamp'},
    text          => $self->{result},
    summary       => 'Posting talk page archive report.',
  }) or die $self->{mw}->{error}->{code} . ': ' . $self->{mw}->{error}->{details};
  print "Done.\n\n";
}

1;