User:Svgalbertian/Scripts/TSX

This is a Perl script that retrieves all active TSX stocks.
Code

use LWP::Simple;
use LWP::UserAgent;
use WWW::Mechanize;
use MediaWiki::API;

use strict;
use warnings;
use POSIX qw(strftime);

# ---------------------------------------------------------------------------
# Purpose
#
# Walks the TMX listed-company directory once per starting character
# (A-Z, 0-9), looks every company up on Wikipedia, fetches its industry from
# the quote page, and writes wikitext tables to the current directory:
#
#   wiki<X>.txt       plain two-column table for starting character <X>
#   wikialt<X>.txt    "wikitable" with an extra INDUSTRY column for <X>
#   wiki_numbers.txt  plain table combining every company starting with 0-9
#
# Progress is echoed to STDOUT; recoverable problems are reported with warn.
# ---------------------------------------------------------------------------

# Placeholder token that is substituted into the URL templates below.
my $replaceme = '%char%';

# Directory search variables
my $dirURL     = 'http://www.tmxmoney.com/HttpController?GetPage=ListedCompanyDirectory&SearchCriteria=Name&SearchKeyword=%char%&SearchType=StartWith&Page=1&SearchIsMarket=Yes&Market=T&Language=en';
my $companyURL = 'http://tmx.quotemedia.com/company.php?qm_symbol=%char%&locale=EN';
my @prefixes   = ("A" .. "Z", 0 .. 9);

# Initialize Mechanize (used to page through the directory listing)
my $mech = WWW::Mechanize->new();

# Initialize Wikipedia
# NOTE(review): the endpoint is plain http as in the original script; confirm
# whether it still accepts API requests without https.
my $mw = MediaWiki::API->new();
$mw->{config}->{api_url} = 'http://en.wikipedia.org/w/api.php';

# Initialize LWP (used for the per-company quote page)
my $browser = LWP::UserAgent->new();

# What is the date?  Built from the localtime fields rather than "%e" so a
# single-digit day is not space padded and the format is portable.
my @now       = localtime;
my $querydate = sprintf '%s %d, %d', strftime('%B', @now), $now[3], $now[5] + 1900;

# Opens $path for writing (truncating it) and returns the file handle.
# Dies with the file name and OS error when the file cannot be opened.
sub open_for_write {
    my ($path) = @_;
    open(my $fh, '>', $path) or die "Cannot open $path for writing: $!";
    return $fh;
}

# Closes $fh, dying with the file name and OS error if buffered output
# could not be flushed.
sub close_or_die {
    my ($fh, $path) = @_;
    close($fh) or die "Cannot close $path: $!";
    return;
}

# Returns the opening wikitext of a plain (two-column) table page whose
# section heading is $heading.
sub plain_table_header {
    my ($heading) = @_;
    return "{{TSX listed stocks}}\n\n==$heading==\n{| style=\"background:transparent;\"\n!Stock Name\n!Symbol\n|----\n";
}

# Returns the closing wikitext of a page.  $date fills the {{Complete}}
# template and $sort_key fills DEFAULTSORT and the category sort key.  The
# footer is built per page because the sort key differs for every page.
sub page_footer {
    my ($date, $sort_key) = @_;
    return join "\n",
        '|}',
        "{{Complete|$date}}",
        '',
        '==See also==',
        '*[[Toronto Stock Exchange]]',
        '*[[List of Canadian companies]]',
        '*[[List of mutual funds listed on the TSX]]',
        '*[[S&P/TSX Composite Index]]',
        '*[[List of companies listed on the TSX Venture Exchange]]',
        '',
        '==External links==',
        '* [http://www.tsx.ca Toronto Stock Exchange]',
        '',
        "{{DEFAULTSORT:Companies Listed On The Toronto Stock Exchange (${sort_key})}}",
        "[[Category:Lists of companies listed on the Toronto Stock Exchange|${sort_key}]]";
}

# Create number page
my $num_path = 'wiki_numbers.txt';
my $num_fh   = open_for_write($num_path);
print {$num_fh} plain_table_header('0-9');

# Cycle through all the directory pages
for my $prefix (@prefixes) {

    # Create the URL of the directory page
    (my $directory_url = $dirURL) =~ s/\Q$replaceme\E/$prefix/g;

    # Create the letter pages
    my $letter_path = "wiki$prefix.txt";
    my $letter_fh   = open_for_write($letter_path);
    print {$letter_fh} plain_table_header($prefix);

    my $alt_path = "wikialt$prefix.txt";
    my $alt_fh   = open_for_write($alt_path);
    print {$alt_fh} "{{TSX listed stocks}}\n\n==$prefix==\n{| class=\"wikitable\"\n|-\n! COMPANY NAME !! SYMBOL !! INDUSTRY\n";

    # Plain-table rows always go to this prefix's page; they are copied to the
    # combined 0-9 page only while a digit prefix is being processed.
    my @plain_handles = ($letter_fh);
    push @plain_handles, $num_fh if $prefix =~ /\A[0-9]\z/;

    # Grab the directory page
    $mech->get($directory_url);

    PAGE:
    while (1) {

        LINK:
        for my $link ($mech->links()) {
            my $url = $link->url();
            next LINK unless defined $url && $url =~ m/company\.php/;

            # Grab company name from the link text; links without text
            # (e.g. image links) cannot be looked up, so skip them.
            my $company = $link->text();
            next LINK unless defined $company && $company ne '';

            # Parse the ticker from the URL (value of the first query
            # parameter).  Done before any output so a malformed link never
            # leaves a half-written row or reuses an older capture.
            my ($ticker) = $url =~ m{=(.*?)&}s;
            unless (defined $ticker) {
                warn "Skipping $company: no ticker found in $url\n";
                next LINK;
            }

            # Get the Wikipedia page; a failed lookup or a missing page is
            # treated the same as "no article".
            my $page     = $mw->get_page( { title => $company } );
            my $wikitext = ($page && defined $page->{'*'}) ? $page->{'*'} : '';

            # Link through a redirect to its target, link directly to an
            # existing article, or leave the name unlinked.
            my $name_cell;
            if ($wikitext =~ m{#REDIRECT\s*\[\[(.*?)\]\]}i) {
                $name_cell = "[[$1|$company]]";
            }
            elsif ($wikitext ne '') {
                $name_cell = "[[$company]]";
            }
            else {
                $name_cell = $company;
            }

            for my $fh (@plain_handles) {
                print {$fh} "|$name_cell\n|{{tsx2|$ticker}}\n|----\n";
            }

            # Get the industry from the company's quote page.  The industry
            # stays empty when the page cannot be fetched or parsed.
            (my $quote_url = $companyURL) =~ s/\Q$replaceme\E/$ticker/g;
            my $response = $browser->get($quote_url);
            my $industry = '';
            if ($response->is_success) {
                my $contents = $response->content();
                if ($contents =~ m{Industry:</td>.*?>(.*?)</td>}is) {
                    $industry = $1;
                }
                else {
                    warn "No industry found for $company ($ticker)\n";
                }
            }
            else {
                warn "Could not fetch $quote_url: " . $response->status_line . "\n";
            }

            print "$company ($ticker), industry: $industry \n";
            print {$alt_fh} "|-\n|$name_cell || {{tsx2|$ticker}} || $industry \n";
        }

        # Move on to the next results page, if the listing has one.
        last PAGE
            unless $mech->find_link(text => 'Next >')
                && $mech->follow_link(text => 'Next >');
    }

    print {$letter_fh} page_footer($querydate, $prefix);
    close_or_die($letter_fh, $letter_path);

    print {$alt_fh} page_footer($querydate, $prefix);
    close_or_die($alt_fh, $alt_path);
}

print {$num_fh} page_footer($querydate, '0-9');
close_or_die($num_fh, $num_path);