# User:AnomieBOT/source/tasks/SpamBlacklistBlocker.pm

package tasks::SpamBlacklistBlocker;

=pod

=begin metadata

Bot:      AnomieBOT III
Task:     SpamBlacklistBlocker
BRFA:     Wikipedia:Bots/Requests for approval/AnomieBOT III 3
Status:   Inactive 2023-07-17
Created:  2016-10-26
Exclusion: false

Block IPs that hit [[User:AnomieBOT III/Spambot URI list|certain URLs]] on the
spam blacklist too frequently.

=end metadata

=cut

use utf8;
use strict;

use AnomieBOT::Task;
use Data::Dumper;
use POSIX qw/strftime/;

use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

# Wiki page holding the list of URL regexes this task acts on. loadBlacklist()
# reads it and derives the "User talk:" page used for complaints from it, so
# the title must start with "User:".
my $botBlacklist = 'User:AnomieBOT III/Spambot URI list';
# Value run() returns after a blacklist or log-fetch failure, and the target
# length of a normal pass (run() returns whatever is left of it).
my $sleeptime = 60; # seconds
# How far back run() looks in the spam blacklist hit log.
my $timeframe = 120; # seconds
# Minimum number of matching log entries within $timeframe before run() blocks
# a user.
my $hits = 2;
# Block duration: passed as the block's expiry and substituted into the talk
# page notice.
my $blocktime = '1 month';

# Constructor. Builds the object through the parent class's new() (called
# without arguments) and re-blesses it into the class new() was invoked on.
sub new {
    my ($class) = @_;
    my $task = $class->SUPER::new();
    return bless( $task, $class );
}

=pod

=for info
Approved 2016-12-25<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT III 3]]

=for info
Deactivated 2023-07-17. On-wiki lists haven't been maintained so it hasn't blocked anything since 2019, and now WMF is changing the blacklist it watches. No point in updating the code if no one cares about the lists anymore.

=cut

# Returns this task's approval code, a fixed -501.
# NOTE(review): the negative value presumably marks the task as deactivated
# (see the "Deactivated 2023-07-17" note in the POD above) -- confirm against
# AnomieBOT::Task.
sub approved {
    my $approval_code = -501;
    return $approval_code;
}

# Scan recent spam blacklist hits and block users that hit listed URLs too often.
#
# Loads the combined URL pattern via loadBlacklist(), walks the
# 'spamblacklist/hit' log events of the last $timeframe seconds, collects the
# matching URLs per user for entries flagged 'anon' or 'temp', and for every
# user with at least $hits matching entries issues a block and then posts a
# notice to that user's talk page.
#
# Parameters:
#   $self - the task object
#   $api  - the API object handed in by the caller
#
# Returns a number of seconds:
#   0          - when $api->halting is true, or no time is left in this cycle
#   $sleeptime - when the blacklist could not be loaded or the log query failed
#   300        - when fetching the edit token reports 'shutoff'
#   otherwise  - what remains of $sleeptime since this call began
# NOTE(review): presumably the framework uses the return value as the delay
# before the next call -- confirm against AnomieBOT::Task.
sub run {
    my ($self, $api) = @_;

    $api->task('SpamBlacklistBlocker', 0, 10, qw/d::Timestamp d::Talk d::Nowiki/);

    my $began = time();

    my $reason = "{{spamblacklistblock}} <!-- Attempted to add a link blacklisted on [[$botBlacklist]] too frequently -->";
    my $blockText = "{{subst:spamblacklistblock|anon=yes|time=$blocktime|notalk=yes|sig=yes}}";

    my $pattern = $self->loadBlacklist( $api );
    return $sleeptime unless $pattern;

    # Pass 1: group the matching URLs by the user that triggered them.
    my %urls_for;
    my $events = $api->iterator(
        list => 'logevents',
        leprop => 'user|details',
        leaction => 'spamblacklist/hit',
        lestart => $api->timestamp2ISO( time() - $timeframe ),
        ledir => 'newer',
        lelimit => 'max',
    );
    while ( my $entry = $events->next ) {
        return 0 if $api->halting;
        unless ( $entry->{'_ok_'} ) {
            $api->warn("Failed to retrieve log entries: " . $entry->{'error'} . "\n");
            return $sleeptime;
        }

        # Skip entries that carry neither an 'anon' nor a 'temp' flag.
        next if !exists( $entry->{'anon'} ) && !exists( $entry->{'temp'} );
        next unless exists( $entry->{'params'}{'url'} );

        my $url = $entry->{'params'}{'url'};
        next unless $url =~ $pattern;

        # push autovivifies the per-user array on first use.
        push @{ $urls_for{ $entry->{'user'} } }, $url;
    }

    # Pass 2: block every user that reached the threshold.
    for my $who ( keys %urls_for ) {
        return 0 if $api->halting;

        my $matched = $urls_for{$who};
        my $count = scalar @$matched;
        next unless $count >= $hits;

        # Report each distinct URL once, in sorted order.
        my %seen = map { $_ => undef } @$matched;
        my $url_list = join( ', ', sort keys %seen );
        $api->log( "Blocking $who for hitting the blacklist $count times in $timeframe seconds ($url_list)" );

        my $tok = $api->edittoken( "User talk:$who", EditRedir => 1, NoExclusion => 1 );
        my $tok_code = $tok->{'code'};
        if ( $tok_code eq 'shutoff' ) {
            $api->warn("Task disabled: " . $tok->{'content'} . "\n");
            return 300;
        }
        elsif ( $tok_code ne 'success' ) {
            $api->warn("Failed to get token for $who: " . $tok->{'error'} . "\n");
            next;
        }

        my $res = $api->action( $tok,
            action => 'block',
            user => $who,
            expiry => $blocktime,
            reason => $reason,
            anononly => 1,
            nocreate => 1,
        );
        my $block_code = $res->{'code'};
        if ( $block_code ne 'success' ) {
            # An existing block is left alone and is not reported as a failure.
            $api->warn("Failed to block $who: " . $res->{'error'} . "\n")
                unless $block_code eq 'alreadyblocked';
            next;
        }

        # The notice is only posted once the block itself succeeded.
        # NOTE(review): the section title includes literal '==' markup; if the
        # wiki wraps sectiontitle in its own heading markup this would be
        # doubled -- confirm how $api->edit passes it through.
        my $heading = strftime( '== %B %Y ==', gmtime );
        $res = $api->edit( $tok, $blockText, $reason, 0, 0, section => 'new', sectiontitle => $heading );
        next if $res->{'code'} eq 'success';
        $api->warn("Failed to post block notice on User talk:$who: " . $res->{'error'} . "\n");
    }

    # Hand back whatever is left of the cycle, never a negative value.
    my $remaining = $began + $sleeptime - time();
    return 0 unless $remaining > 0;
    return $remaining;
}

# Load the bot's URL list from $botBlacklist and compile it into one regex.
#
# The page must be in the User: namespace and fully (sysop) edit-protected.
# The list is read from exactly one block of the form "<pre>\n...\n</pre>"
# among the blocks returned by $api->strip_nowiki(). Within the list, anything
# from '#' to end of line is dropped, surrounding whitespace is trimmed, and
# blank lines are skipped. Every remaining line must compile as a regular
# expression on its own.
#
# Parameters:
#   $self - the task object
#   $api  - the API object used for the query, warnings and whines
#
# Returns the compiled regex (qr// object) on success. Returns undef when the
# page cannot be loaded, is not protected, has no or several <pre> blocks,
# contains a line that does not compile, or yields no patterns at all.
# Problems with the page content are reported with $api->whine() to the
# matching "User talk:" page; the other failures go to $api->warn(), except
# an empty list, which is silent.
sub loadBlacklist {
    my ($self, $api) = @_;

    my $whineTo = $botBlacklist;
    $whineTo =~ s/^User:/User talk:/;
    if ( $whineTo eq $botBlacklist ) { # WTF?
        $api->warn( "$botBlacklist must be a userspace page\n" );
        return undef;
    }

    my $res = $api->query(
        titles => $botBlacklist,
        prop => 'revisions|info',
        rvprop => 'content',
        rvslots => 'main',
        rvlimit => 1,
        inprop => 'protection',
        formatversion => 2,
    );
    if ( $res->{'code'} ne 'success' ) {
        $api->warn( "Failed to load $botBlacklist: " . $res->{'error'} . "\n" );
        return undef;
    }
    unless ( exists( $res->{'query'}{'pages'}[0] ) ) {
        $api->warn( "Failed to load $botBlacklist: Page node not found in response\n" );
        return undef;
    }
    my $page = $res->{'query'}{'pages'}[0];

    # Refuse to act on a list that non-admins could edit.
    my $protected = 0;
    for my $p (@{$page->{'protection'} // []}) {
        $protected = 1 if $p->{'type'} eq 'edit' && $p->{'level'} eq 'sysop';
    }
    if ( !$protected ) {
        $api->whine( 'Page is unprotected', 'The blacklist page must be fully protected. I cannot run until that is done; be sure that the list is correct before protecting.', Pagename => $whineTo );
        return undef;
    }

    # Only the second return value (the stripped blocks) is needed here.
    my ($dummy, $nowiki) = $api->strip_nowiki( $page->{'revisions'}[0]{'slots'}{'main'}{'content'} );
    my $list = undef;
    for my $block (values %$nowiki) {
        next unless $block=~/^<pre>\n(.*)\n<\/pre>$/s;
        if ( defined( $list ) ) {
            $api->whine( 'Page is broken', 'I look for the list inside {{tag|pre}}, but too many such blocks were found. I cannot run until that situation is corrected.', Pagename => $whineTo );
            return undef;
        }
        $list = $1;
    }
    if ( !defined( $list ) ) {
        $api->whine( 'Page is broken', 'I look for the list inside {{tag|pre}}, but no such tags were found. I cannot run until that situation is corrected.', Pagename => $whineTo );
        return undef;
    }

    my @regexes = ();
    for my $line (split /\r?\n/, $list){
        $line =~ s/#.*$//;
        $line =~ s/^\s+|\s+$//g;
        next if $line eq '';

        # Validate each line, in case someone screws up the blacklist page.
        # Success is taken from eval's return value (the block ends in a true
        # value) rather than from $@ alone.
        my $ok = eval {
            qr/^(?:$line)$/im;
            1;
        };
        if ( !$ok ) {
            $api->whine( 'Page contains a broken regex', "The line <code><nowiki>$line</nowiki></code> cannot be parsed as a regular expression. The error is <code><nowiki>$@</nowiki></code>. I cannot run until this situation is corrected.", Pagename => $whineTo );
            return undef;
        }

        push @regexes, $line;
    }

    # An empty list means there is nothing to match; this case is silent.
    return undef unless @regexes;

    # Give every line its own non-capturing group. Each line was validated in
    # isolation as (?:$line), so it must be combined the same way: otherwise
    # an inline modifier in one line (e.g. "(?-i)") would stay in effect for
    # all alternatives that follow it.
    my $alternation = join( '|', map { "(?:$_)" } @regexes );
    my $re = eval {
        qr!^(?:https?:)?//+[a-z0-9_\-.]*(?:$alternation)!im;
    };
    if ( !defined( $re ) ) {
        $api->whine( 'Regex list as a whole is broken', "When combined into a final regex, an error was returned. The error is <code><nowiki>$@</nowiki></code>. I cannot run until this situation is corrected.", Pagename => $whineTo );
        return undef;
    }

    return $re;
}

1;