use strict;
use warnings;
use constant N=>"\n";
use constant T=>"\t";
my $diag=0;
my $set='AC'; #AC DL MR SZ
$set=$ARGV[0] if $ARGV[0];
$diag=$ARGV[1] if $ARGV[1];
sub download {
my $home=shift;
require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->timeout(1000); #Internet is under 200kbps in NZ
$ua->proxy(['http', 'ftp'], 'http://tur-cache.massey.ac.nz:8080/');
$ua->env_proxy;
my $response = $ua->get($home);
return retry_download($home,$response->status_line) if ! $response->is_success;
my $reply=$response->decoded_content;
return retry_download($home,'Message empty') if length($reply)<10;
return retry_download($home,'Serverside error') if ($reply=~m/Error\+11\+\(Resource\+temporarily\+unavailable\)/i);
return $reply; #really the internet actually works here?!
}
sub retry_download {
my $home=shift;
print 'Issue with '.$home.N;
print shift(@_).N;
print '1 minute pause...'.N;
sleep 60;
return download($home);
}
sub parse_LPSN_page {
#split into paragraphs...
my $file=shift;
my $name=shift;
my $cut='<a href="#r"><img border="0" src="top.jpg" width="23" height="11" alt="image"></a>';
my $col='<font color="#0000FF">';
my $colH='<font color="#FF0000">';
$file=~ s/.*Number of species cited in this file//sm;
$file=~ s/Copyright.*//sm;
my $total=0;
($file=~ m/^\:\s+(\d+)/) ? ($total=$1):(print 'Error... Tally not found'.N.$file.N);
my @species=split/\Q$cut\E/,$file;
my $about=shift(@species);
my ($author,$type,$ety,$doi,@spp); my $em='';
if ($about=~ m/$colH<i><b>$name<\/b><\/i><\/font>(.*)/) {$author=$1; $author=~s/[\r\n]//g} else {print "Cannot find authority with $col<i><b>$name<\/b><\/i><\/font>\n"; $em.=' parse error for authority'}
if ($about=~ m/$col<b>Type<\/b><\/font> $col<b>species<\/b>\:<\/font> <a .*?\/a> <i>$name<\/i> <i>(\w+)<\/i>/) {$type=$1} else {print "Cannot find type epitet\n"; $em.=' parse error for type species'}
if ($about=~ m/$col<b>Etymology<\/b>\:<\/font>(.*)/) {$ety=$1; $ety=~s/[\r\n]//g} else {print "Cannot find Etymology\n"; $em.=' parse error for etymology'}
if ($about=~ m/http\:\/\/dx\.doi\.org\/(.*?)\"/) {$doi=$1;}
elsif ($about=~ m/(http\:\/\/ijs\.sgmjournals\.org\/cgi\/reprint\/.*?)\"/) {$doi=$1;}
else {print "Cannot find Doi\n"; $em.=' parse error for doi'}
if ($about=~ m/---\>\;/) {print "Emendment detected\n"; $em.=' emendment'}
my $first=substr($name, 0, 1);
foreach my $sp (@species) {
my ($ln,$la,$le,$ld,$ll); my $lx='';
if ($sp=~ m/$name<\/b><\/i><\/font> $colH<i><b>(\w+)<\/b><\/i><\/font>(.*)/) {($ln,$la)=($1,$2); $la=~s/[\r\n]//g} else {next; print "Cannot find authority in $sp\n"; $lx.=' parse error for name'}
if ($sp=~ m/$col<b>Etymology<\/b>\:?<\/font>(.*)/) {$le=$1; $le=~s/[\r\n]//g} else {print "Cannot find Etymology in $sp\n"; $lx.=' parse error for etymology'}
if ($sp=~ m/http\:\/\/dx\.doi\.org\/(.*?)\"/) {$ld=$1;}
elsif ($sp=~ m/(http\:\/\/ijs\.sgmjournals\.org\/cgi\/reprint\/.*?)\"/) {$ld=$1;}
else {print "Cannot find Doi\n"; $lx.=' parse error for doi'}
if ($sp=~ m/---\>\;/) {print "Emendment detected\n"; $lx.=' emendment'}
$ll="* \[\[$name $ln|$first. $ln\]\] ($la\;$le)<ref>\{\{cite doi\|$ld\}\}</ref>";
$ll.="<!-- Manual check required due to$lx! -->" if $lx;
push(@spp,$ll)
}
return ($total,$author,substr($name,0,1).'. '.$type,$ety,$doi, $em,@spp);
}
#########################
my $q='"';
open(WIKI,'>',$set.'_wiki.txt') or die;
open(LIST,$set.'.html') or die "cannot open file\n\a";
foreach (grep(/Domain/,split(/<\/p>/,do { local $/; <LIST> }))) {
s/[\n\r]//msg;
s/Division or phylum/Division/;
s/Domain or empire/Domain/;
s/\ \;/ /g;
s/<span .*?>//g; #they have no use, I think
my @lines=split/<br \/>/;
my $temp=shift(@{[grep(/<a name/,@lines)]});
$temp=~m/<a href\=\"(\w+\/\w+.html)\"><font color\=\"\#FF0000\">(\w+)<\/font>/;
my ($glink,$genus)=($1,$2);
print "Error with $temp\n\a" if ! $glink;
print "Parsing $genus\n" if $diag;
my %tax=(genus=>[$genus,$glink]);
my ($total,$author,$type,$ety,$doi,$error,@spp);
if ($glink) {
if (!-e $glink) {open(PAGE,'>',$glink) or die "cannot make file $glink\n\a";print "downloading $genus from $glink\n"; print PAGE download('http://www.bacterio.cict.fr/'.$glink);close (PAGE);}
open(PAGE,$glink) or die "cannot open file $glink\n\a";
($total,$author,$type,$ety,$doi,$error,@spp)=parse_LPSN_page(do { local $/; <PAGE> },$genus);
close (PAGE);
} else {print "HELP!"}
foreach my $rank qw(Family Suborder Order Subclass Class Division Domain) {
my ($link,$taxa);
$temp=shift(@{[grep(/$rank\:/,@lines)]});
if (($temp !~ m/$rank\:\W+$/)&&($temp=~ m/$rank\:/)) {
if ($temp=~m/$rank\:\s+<a href\=\"([\w\#\/\.]+)\">(\w+)<\/a>/) {($link,$taxa)=($1,$2)}
elsif ($temp=~m/$rank\:\s+$q<a href\=\"([\w\#\/\.]+)\">(\w+)<\/a>$q/) {($link,$taxa)=($1,'"'.$2.'"')} #odd bug with $q
elsif ($temp=~m/$rank\:\s+$q<a href\=\"([\w\#\/\.]+)\"><font.*?>(\w+)<\/font><\/a>$q/) {($link,$taxa)=($1,'"'.$2.'"')} #does this mean anything different
elsif ($temp=~m/$rank\:\s+$q<i>(\w+)<\/i>$q/) {$taxa=$1}
else {print "\nImminent error\r";}
$error.=" $rank error" if ! $taxa;
$tax{$rank}=[$taxa,$link];
}
}
print WIKI "\n\n==\[\[$genus\]\]==\n<nowiki>\{\{italic title\}\}\n";
print WIKI "<!--Errors: $error -->\n" if $error;
print WIKI "\{\{Taxobox\n\| color \= lightgrey\n\| name \= ''$genus''\n";
print WIKI "\| domain \= \[\[".$tax{Domain}->[0]."\]\]\n" if $tax{Domain}->[0];
print WIKI "\| phylum \= \[\[".$tax{Division}->[0]."\]\]\n" if $tax{Division}->[0];
print WIKI "\| classis \= \[\[".$tax{Class}->[0]."\]\]\n" if $tax{Class}->[0];
print WIKI "\| subclassis \= \[\[".$tax{Subclass}->[0]."\]\]\n" if $tax{Subclass}->[0];
print WIKI "\| ordo \= \[\[".$tax{Order}->[0]."\]\]\n" if $tax{Order}->[0];
print WIKI "\| subordo \= \[\[".$tax{Suborder}->[0]."\]\]\n" if $tax{Suborder}->[0];
print WIKI "\| familia \= \[\[".$tax{Family}->[0]."\]\\n" if $tax{Family}->[0];
print WIKI "\| genus \= ''$genus''\n";
print WIKI "\| binomial_authority \= $author<ref>\{\{cite doi\|$doi\}\}<\/ref>" if $author;
print WIKI "\| type_species \= $type \n" if $type;
print WIKI "\| subdivision_ranks \= Species \n\}\}\n";
my $tp=$tax{Division}->[0]; $tp=~s/\"//g; my $td=$tax{Domain}->[0]; $td=~s/\"//g;
print WIKI "'''''$genus''''' is a genus in the phylum \[\[".$tp.']] ([['.$td.']]).<ref>{{lpsn|classification'.lc($set).'.html|Classification of Genera '.$set.'}}</ref>'.N;
print WIKI "The etymology of the genus is $ety.<ref name=main>\{\{lpsn\|$glink\|$genus\}\}<\/ref>\nThe genus contains $total species (including basonyms and synonyms), namely<ref name=main/>\n".join(N,@spp).N;
print WIKI '==See Also=='.N.'* [[Bacterial taxonomy]]\n* [[Microbiology]]'.N;
print WIKI '== References =='.N.'{{reflist}}'.N.'[[Category:'.$td.']]'.N.'[[Category:'.$tp.']]</nowiki>'.N;
}
print 'Done'.N;