User:HighInBC/MCP/NameWatcher

This is a plugin, not a stand alone program. See here[1] for GFDL history prior to 02:54, June 4, 2007.

package HBCPlugins::NameWatcher;
use     strict;
use     Data::Dumper;
use     MediaWiki;
use     URI::Escape;
our     $self;

sub new
  {
  shift;
  $self = shift;
  bless($self);

  $self->{params}{version_number} = '1.0.2';
  $self->{params}{VERSION} = 'HBC NameWatcherBot v'.$self->{params}{version_number}; 
  warn $self->{params}{VERSION}." starting up.\n";
  $self->{params}{parameter_expected} =
   {
    'Status'                    => 1,
    'Default target'            => 1,
    'Long name'                 => 1,
    'Repeating character'       => 1,
    'Write rate'                => 1,
    'Recheck duration'          => 1,
    'Recheck interval'          => 1,
   };
  $self->{params}{parameter_required} =
   [
    'Status',
    'Default target',
   ];
  $self->{params}{default_params} =
   {
    'Write rate'                => 10,
    'Long name'         => 0,
    'Repeating character'       => 0,
   };

  $self->{params}{whitelist} = [];
  $self->{params}{blacklist} = [];
  $self->{params}{delayed_names} = {};
  
  &{$self->{shared}{add_job}}(\&contact_irc_plugin , 0);
  &{$self->{shared}{add_job}}(\&login,0);
  &{$self->{shared}{add_job}}(\&check_homoglyphs,2);
  &{$self->{shared}{add_job}}(\&check_parameters,2);
  &{$self->{shared}{add_job}}(\&check_whitelist,2);
  &{$self->{shared}{add_job}}(\&check_blacklist,2);
  &{$self->{shared}{add_job}}(\&debug_report,5);

  return $self;
  }

sub check_version {
  my ($active_version,$my_version) = @_;
  my @active_parts = split(/\./, $active_version);
  my @my_parts = split(/\./, $my_version);

  return 0 if scalar(@active_parts) > scalar(@my_parts); # should never happen

  foreach (@active_parts) {
    my $check_part = shift(@my_parts);
    last if $check_part > $_;
    next if $_ <= $check_part;
    return 0;
  }
  return 1;
}
                                  

sub send_info {
  my $type = shift;
  if ($self->{params}{send_info_to}) {
    my %payloads = (
     'client'    => $self->{WP_obj},
     'panel'     => $self->{params}{param},
     'blacklist' => $self->{params}{blacklist},
     'whitelist' => $self->{params}{whitelist}
    );
    foreach my $target (@{$self->{params}{send_info_to}}) {
      warn "Sending data to $target: $type\n";
      $self->{shared}{$target}->set_info($type, $payloads{$type});
    }
  }
}

sub login
  {
  warn "Connecting to Wikipedia...\n";
  my $c                  =   MediaWiki->new;
  $c->setup
                          ({
                            'bot' => {'user' => $self->{params}{username},'pass' => $self->{params}{password}},
                            'wiki' => {'host' => 'en.wikipedia.org','path' => 'w'}
                          }) || warn "Failed to log in\n";
  my $whoami              =  $c->user();
  warn "$whoami connected\n";
  $self->{WP_obj} = $c;
  send_info('client');
  &{$self->{shared}{add_job}}(\&login,3600);
  }

sub debug_report
  {
  warn Dumper
   ({
#    'Whitelist'        => $self->{params}{whitelist},
#    'Blacklist'        => $self->{params}{blacklist},
    'Params'    => $self->{params}{param},
    'Delayed'   => $self->{params}{delayed_names},
   });
  &{$self->{shared}{add_job}}([\&debug_report],300);
  }

sub contact_irc_plugin
  {
  $self->{IRCFeed} = $self->{shared}{$self->{params}{irc_label}} || die;
  $self->{IRCFeed}->add_hook
   ({
     'check'    => sub
        {
        my $text = shift;
        my $pattern = chr(0x03).'03(.*?)'.chr(0x03);
        my $createpattern = chr(0x03).'4\s*create'.chr(0x03);
        if (($text =~ m|$createpattern|) && ($text =~ m"$pattern"))
          {
          return $1;
          }
        else
          {
          return 0;
          }
        },
     'callback' => \&check_name,
    }
   );
  $self->{IRCFeed}->add_hook
   ({
     'check'    => sub {$_[0] =~ m|User:HBC NameWatcherBot/Control panel|},
     'callback' => \&check_parameters,
    }
   );
  $self->{IRCFeed}->add_hook
   ({
     'check'    => sub {$_[0] =~ m|User:HBC NameWatcherBot/Whitelist|},
     'callback' => \&check_whitelist,
    }
   );
  $self->{IRCFeed}->add_hook
   ({
     'check'    => sub {$_[0] =~ m|User:HBC NameWatcherBot/Blacklist|},
     'callback' => \&check_blacklist,
    }
   );
  return;
  }

sub check_parameters
  {
  warn "Checking parameters...\n";
  my $page = $self->{WP_obj}->get('User:HBC NameWatcherBot/Control panel','r')->content();
  %{$self->{params}{param}} = ();
  foreach my $line (split("\n",$page))
    {
    if ($line =~ m|^;(.*?):(.*)$|)
      {
      $self->{params}{param}{$1} = $2 if ($self->{params}{parameter_expected}{$1});
      }
    }
  die "Bot is not turned on in the control panel\n" unless (lc($self->{params}{param}{'Status'}) eq 'on');
  foreach my $key (@{$self->{params}{parameter_required}})
    {
    die "Missing required parameter: $key\n" unless ($self->{params}{param}{$key});
    }
  %{$self->{params}{param}} = (%{$self->{params}{default_params}},%{$self->{params}{param}});
  send_info('panel');
  }

sub check_homoglyphs {
  warn "Checking homoglyphs...\n";
  my $page = $self->{WP_obj}->get('User:HBC_NameWatcherBot/Homoglyphs','r')->content();
  utf8::upgrade($page);
  @{$self->{params}{homoglyphs}} = ();
  foreach my $line (split("\n",$page)) {
    next unless ($line =~ m|^;(.+?):\s*(.+)$|);
    my $symbol = $1;
    my @matches = split(/\s+/,$2);
    for (@matches) {$_ = quotemeta($_)};
    push (@{$self->{params}{homoglyphs}}, $symbol, \@matches)
  }
}

sub check_whitelist
  {
  warn "Checking whitelist...\n";
  my $page = $self->{WP_obj}->get('User:HBC NameWatcherBot/Whitelist','r')->content();
  @{$self->{params}{whitelist}} = ();
  foreach my $line (split("\n",$page))
    {
    push(@{$self->{params}{whitelist}},$1) if ($line =~ m|^;(.*)$|);
    }
  send_info('whitelist');
  }

sub check_blacklist
  {
  warn "Checking blacklist...\n";
  my $page = $self->{WP_obj}->get('User:HBC NameWatcherBot/Blacklist','r')->content();
  @{$self->{params}{blacklist}} = ();
  foreach my $line (split("\n",$page))
    {
    if ($line =~ m|^;(.*?)(:(.*))?$|)
      {
      my $rh_entry = {};
      ${$rh_entry}{word} = $1;
      if ($2)
        {
        my @flags = split(',',$3);
        ${$rh_entry}{flags} = parse_flags(\@flags);
        }
      if (${$rh_entry}{'flags'}{'HOMOGLYPH'}) {
        my $hg_regex = compile_homoglyph_pattern(${$rh_entry}{'word'});
        ${$rh_entry}{'flags'}{'LABEL'}[0] = ('Homoglyph of: "'.${$rh_entry}{'word'}.'"');
        ${$rh_entry}{'flags'}{'REGEX'} = 1;
        ${$rh_entry}{'word'} = $hg_regex;
      }
      push(@{$self->{params}{blacklist}},$rh_entry);
      }
    }
  send_info('blacklist');
  }

sub parse_flags
  {
  my $ra_flags = shift;
  my %has_flag;
  foreach my $flag (@{$ra_flags})
    {
    if ($flag =~ m|(.*)\((.*)\)|)
      {
      my (@params) = split('\|',$2);
      $has_flag{$1} = \@params;
      }
    else
      {
      $has_flag{$flag} = 1
      }
    }
  return \%has_flag;
  }

sub check_name
  {
  shift;
  my($name) = shift;
  my $display_name = "'$name'";
  my $master_name = $name;
  warn "Checking name: $display_name\n";
  foreach my $white_word (@{$self->{params}{whitelist}}) {$name =~ s|$white_word||gi;} # Remove occurances of whitelisted words
  my $ra_offenses = [];
  foreach my $rh_black_word (@{$self->{params}{blacklist}})
    {
    my $word = ${$rh_black_word}{word};
    if (${$rh_black_word}{flags}{'REGEX'})
      {
      if (${$rh_black_word}{flags}{'CASE_SENSITIVE'})
        {
        push(@{$ra_offenses},$rh_black_word) if ($name =~ m|$word|);
        }
      else
        {
        push(@{$ra_offenses},$rh_black_word) if ($name =~ m|$word|i);
        }
      }
    else
      {
      if (${$rh_black_word}{flags}{'CASE_SENSITIVE'})
        {
        push(@{$ra_offenses},$rh_black_word) if (index($name,$word) > -1);
        }
      else
        {
        push(@{$ra_offenses},$rh_black_word) if (index(lc($name),lc($word)) > -1);
        }
      }
    }
  &{$self->{shared}{add_job}}([\&bad_name,$master_name,$ra_offenses],0) if
        (
         scalar(@{$ra_offenses}) ||
         (($self->{params}{param}{'Long name'}) && (length($name) > $self->{params}{param}{'Long name'})) ||
         (($self->{params}{param}{'Repeating characters'}) && (detect_repeatition($name,$self->{params}{param}{'Repeating characters'}))) ||
         (($self->{params}{param}{'Repeating numbers'}) && (detect_repeatition_numbers($name,$self->{params}{param}{'Repeating numbers'})))
        );
  return 1;
  }

sub bad_name
  {
  my ($name, $ra_offenses) = @_;
  my(@offense_reports) = ('* {{user-uaa|1='.$name.'}}');
  my $need_rechecking = 0; # default no reason to report
  my $need_header = 0;
  my $has_edited = user_has_edited($name);
  my $is_blocked = is_blocked($name);
  my $target_page = $self->{params}{param}{'Default target'};
  push(@offense_reports,':* This user has edited at least one time.') if ($has_edited);
  if (($self->{params}{param}{'Long name'}) && (length($name) > $self->{params}{param}{'Long name'}))
    {
    push(@offense_reports, ":* At ".length($name)." characters, this name exceeds ".$self->{params}{param}{'Long name'}." characters");
    }
  if (($self->{params}{param}{'Repeating characters'}) && (my $char = detect_repeatition($name,$self->{params}{param}{'Repeating characters'})))
    {
    push(@offense_reports, ":* This name has the character \"'''".$char."'''\" repeating ".$self->{params}{param}{'Repeating characters'}." or more than times in a row.");
    }
  if (($self->{params}{param}{'Repeating numbers'}) && (detect_repeatition_numbers($name,$self->{params}{param}{'Repeating numbers'})))
    {
    push(@offense_reports, ":* This name has ".$self->{params}{param}{'Repeating numbers'}." or more numbers in a row.");
    }
  foreach my $rh_blackword (@{$ra_offenses})
    {
    my $rh_has_flag = ${$rh_blackword}{flags};
    my $label = ((${$rh_has_flag}{'LABEL'}) ? ("called '''".${${$rh_has_flag}{'LABEL'}}[0]."'''") : ("'''${$rh_blackword}{word}'''"));
    my $name_pattern = '('.${$rh_blackword}{word}.')';
    my $match;
    if (${$rh_has_flag}{'CASE_SENSITIVE'})
      {
      $name =~ m|$name_pattern|;
      $match = " The portion that matched was '''$1'''." if ($1);
      }
    else
      {
      $name =~ m|$name_pattern|i;
      $match = " The portion that matched was '''$1'''." if ($1);
      }
    my(@report) = (((${$rh_has_flag}{'REGEX'}) ? (":* Matches the regular expression $label.$match") : (":* Matches the literal pattern $label.")));
    if (${$rh_has_flag}{'CASE_SENSITIVE'}) {
      push (@report, "::* This filter is case sensitive.");
    }
    if (${$rh_has_flag}{'WAIT_TILL_EDIT'})
      {
      if ($has_edited)
        {
        push (@report, "::* This report was delayed until the user edited.");
        delete(${$self->{params}{param}{delayed_names}}{$name});
        }
      else
        {
        warn "\t$name has not edited, not counting match to the string $label at this point, rechecking in ".$self->{params}{param}{'Recheck interval'}." seconds.\n";
        $need_rechecking = 1 unless (scalar(@offense_reports) > 1);
        next;
        }
      }
    if (${$rh_has_flag}{'HOMOGLYPH'}) {
      push (@report, "::* This filter attempts to see past the use of similar looking characters and creative substitutions, see [[Homoglyphs]].");
    }
    $need_rechecking = 0;
    warn Dumper
     ({
      'Name'    => $name,
      'Flags'   => $rh_has_flag,
      'Word'    => ${$rh_blackword}{'word'},
     });
    if  (${$rh_has_flag}{'HTTP_CHECK'})
      {
      warn "Gotta check '$name' for http server.\n";
      my $code = $self->{WP_obj}->{ua}->head('http://'.$name)->code();
      if (($code == 500) && (ref(${$rh_has_flag}{'HTTP_CHECK'})) && (lc(${${$rh_has_flag}{'HTTP_CHECK'}}[0]) eq 'ignore_fail'))
        {
        warn "Ignoring failed connection.\n";
        return;
        }
      my $info = '';
      if        ($code =~ m|^2..$|)
        {
        $info = "A response code starting in 2 means '''HTTP server found, succesful request'''.";
        }
      elsif     ($code =~ m|^3..$|)
        {
        $info = "A response code starting in 3 means '''HTTP server found, client redirected'''.";
        }
      elsif     ($code =~ m|^4..$|)
        {
        $info = "A response code starting in 4 means '''HTTP server found, HTTP server gave error'''.";
        }
      elsif     ($code =~ m|^5..$|)
        {
        $info = "A response code starting in 5 means '''HTTP server not found'''.";
        }
      push (@report, "::* When using the name '''$name([http://$name link])''' as a http URL I got a response code of '''$code'''.".$info);
      }
    if  (${$rh_has_flag}{'ALTERNATE_TARGET'})
      {
      $target_page = ${${$rh_has_flag}{'ALTERNATE_TARGET'}}[0];
      warn "Alternate target: $target_page\n";
      }
    if  (${$rh_has_flag}{'USE_HEADER'})
      {
      $need_header = 1;
      }
    if  (${$rh_has_flag}{'LOW_CONFIDENCE'})
      {
      push (@report, "::* The string $label is prone to false positives, please take extra care ensuring this name is actually a violation before blocking.");
      }
    if  (${$rh_has_flag}{'NOTE'})
      {
      push (@report, "::* The string $label has a comment associated with it: ".${${$rh_has_flag}{'NOTE'}}[0]);
      }
    if  (${$rh_has_flag}{'SOCK_PUPPET'})
      {
      push (@report, "::* The string $label is often associate with sock puppets of [[User:".${${$rh_has_flag}{'SOCK_PUPPET'}}[0]."]] {{#ifexist:Category:Suspected Wikipedia sockpuppets of ".${${$rh_has_flag}{'SOCK_PUPPET'}}[0]."|(see [[:Category:Suspected Wikipedia sockpuppets of ".${${$rh_has_flag}{'SOCK_PUPPET'}}[0]."]])}}");
      }
    push (@offense_reports,@report);
    }
  if ($need_rechecking)
    {
    ${$self->{params}{param}{delayed_names}}{$name} ||= time();
    my $report_age = (time() - ${$self->{params}{param}{delayed_names}}{$name});
    if (($report_age > $self->{params}{param}{'Recheck duration'}) || ($is_blocked))
      {
      warn "Forgetting about $name, it has been $report_age seconds since creation and no edits.\n";
      warn "\tThe user is blocked you see.\n" if ($is_blocked);
      delete(${$self->{params}{param}{delayed_names}}{$name});
      }
    else
      {
      warn "Scheduling recheck of $name\n";
      &{$self->{shared}{add_job}}([\&check_name,$name,$name],$self->{params}{param}{'Recheck interval'});
      }
    return 1;
    }
  unshift(@offense_reports, "== Account [[User:$name]] created ==") if $need_header;
  my $report = (join("\n",@offense_reports)." ~~~~\n");
  warn "Writing to: $target_page\n";
  my $page = $self->{WP_obj}->get($target_page, 'rw');
  if (($page->content() =~ m|<\!-- HBC NameWatcherBot v(\d+\.\d+\.\d+) allowed -->|) && !($is_blocked))
    {
    my $version_number_needed = $1;
    unless (check_version($version_number_needed,$self->{params}{version_number})) {
      warn "I need $version_number_needed and my version is ".$self->{params}{version_number}.".\n";
      warn "I am too old to edit this page.\n";
      return;
    }
    while ($page->{'content'} !~ m|\n\n$|) {$page->{'content'} = $page->{'content'}."\n";}
    $page->{'content'} = ($page->{'content'}.$report);
    $page->{'summary'} = "Reporting [[Special:Contributions/$name|$name]] ([[Special:Blockip/$name|block]]).";
    $page->save();
    warn ($page->{'summary'}."\n");
#    warn ($page->{'content'}."\n\n\n");
    sleep($self->{params}{param}{'Write rate'});
    warn "\n";
    }
  elsif ($is_blocked)
    {
    warn "Skipping, user is already blocked.\n";
    }
  else
    {
    warn "I don't have permission to write to '$target_page'\n";
    return;
    }
  }

sub user_has_edited
{
    my ($name) = @_;
    my $url = 'http://en.wikipedia.org/w/api.php?action=query&list=usercontribs&format=xml&ucuser='.uri_escape($name);
    my $data = $self->{WP_obj}->{ua}->get($url)->content();
    if (!$data || $data =~ m|<usercontribs\s*/>|) {
        # blank XML container, no contribs, return zero
        return 0;
    } else {
        return 1;
    }
}

sub is_blocked
  {
  my ($name) = @_;
  my $url = $self->{WP_obj}->{index}.'?title=Special:Ipblocklist&ip='.uri_escape($name);
  my $data = $self->{WP_obj}->{ua}->get($url)->content(); # Get blocklist info for user
  return ($data =~ m|</a>\) blocked <a href|)       # If the user is currently blocked
  }

sub detect_repeatition
  {
  my $string = shift;
  my $limit = shift;
  my $last_char;
  my $current_rep = 1;
  while (my $char = substr($string,0,1,''))
    {
    if ($char eq $last_char)
      {
      $current_rep++;
      }
    else
      {
      $last_char = $char;
      $current_rep = 1;
      }
    return $char if ($current_rep >= $limit);
    }
  return 0;
  }

sub detect_repeatition_numbers
  {
  my $string = shift;
  my $limit = shift;
  my $last_char;
  my $current_rep = 1;
  while (my $char = substr($string,0,1,''))
    {
    if ($char =~ m|^[0-9]$|)
      {
      $current_rep++;
      }
    else
      {
      $current_rep = 0;
      }
    return 1 if ($current_rep >= $limit);
    }
  return 0;
  }

sub compile_homoglyph_pattern {
  my $target_pattern = shift;
  my @homoglyphs = @{$self->{params}{homoglyphs}};
  while (scalar(@homoglyphs)) {
    my ($str, $ra_values) =  (shift(@homoglyphs),shift(@homoglyphs));
    next unless (scalar(@{$ra_values}));
    my $pattern = ('('.join('|', $str,@{$ra_values}).')');
    $target_pattern =~ s"$str"$pattern"ig; #"
  }
  return $target_pattern;
}

1;