This is a Perl script to retrieve all active TSX stocks.
Code
use strict;
use warnings;

use LWP::Simple;
use LWP::UserAgent;
use WWW::Mechanize;
use MediaWiki::API;
use POSIX qw(strftime);

# Scrapes the TMX Money listed-company directory (one directory search per
# leading character A-Z and 0-9) and writes wikitext list pages:
#
#   wiki<X>.txt       two-column list (name, symbol) for leading character X
#   wikialt<X>.txt    "wikitable" variant with an extra industry column
#   wiki_numbers.txt  combined two-column list for the digit pages (0-9)
#
# For every company the English Wikipedia API is queried so that the name
# can be linked to an existing article (following a redirect if present),
# and the company's quote page is fetched to extract its industry.

# Placeholder substituted into the URL templates below.
my $PLACEHOLDER = '%char%';

# Directory search variables
my $dir_url = 'http://www.tmxmoney.com/HttpController?GetPage=ListedCompanyDirectory&SearchCriteria=Name&SearchKeyword=%char%&SearchType=StartWith&Page=1&SearchIsMarket=Yes&Market=T&Language=en';
my $company_url = 'http://tmx.quotemedia.com/company.php?qm_symbol=%char%&locale=EN';
my @index_keys  = ('A' .. 'Z', 0 .. 9);

# Shared wikitext fragments
my $PAGE_BANNER      = "{{TSX listed stocks}}\n\n";
my $PLAIN_TABLE_HEAD = "{| style=\"background:transparent;\"\n!Stock Name\n!Symbol\n|----\n";
my $WIKI_TABLE_HEAD  = "{| class=\"wikitable\"\n|-\n! COMPANY NAME !! SYMBOL !! INDUSTRY\n";

# build_footer($sort_key, $query_date)
# Returns the closing wikitext for a list page. The footer is built per page
# because it embeds the page's own sort key in DEFAULTSORT and the category
# link; building it once before the loop would leave that key empty.
sub build_footer {
    my ($sort_key, $query_date) = @_;

    return join '',
        "|}\n{{Complete|${query_date}}}\n\n",
        "==See also==\n",
        "*[[Toronto Stock Exchange]]\n",
        "*[[List of Canadian companies]]\n",
        "*[[List of mutual funds listed on the TSX]]\n",
        "*[[S&P/TSX Composite Index]]\n",
        "*[[List of companies listed on the TSX Venture Exchange]]\n\n",
        "==External links==\n",
        "* [http://www.tsx.ca Toronto Stock Exchange]\n\n",
        "{{DEFAULTSORT:Companies Listed On The Toronto Stock Exchange (${sort_key})}}\n",
        "[[Category:Lists of companies listed on the Toronto Stock Exchange|${sort_key}]]";
}

# company_cell($wiki, $company)
# Returns the wikitext for the company-name cell:
#   [[Target|Name]]  when the Wikipedia page for Name is a redirect,
#   [[Name]]         when a page with that title has content,
#   Name             otherwise (no page, or the lookup failed).
sub company_cell {
    my ($wiki, $company) = @_;

    my $page = $wiki->get_page( { title => $company } );
    my $text = ( $page && defined $page->{'*'} ) ? $page->{'*'} : '';

    # Non-greedy capture so only the first [[...]] on the line is taken.
    if ( $text =~ m{#REDIRECT\s*\[\[(.*?)\]\]}i ) {
        return "[[$1|$company]]";
    }
    return "[[$company]]" if length $text;
    return $company;
}

# ticker_from_url($url)
# Returns the value of the first query parameter in $url (the text between
# the first '=' and the following '&' or end of string), or undef when the
# URL contains no '='.
sub ticker_from_url {
    my ($url) = @_;

    return $url =~ m{=(.*?)(?:&|\z)}s ? $1 : undef;
}

# fetch_industry($ua, $url)
# Fetches the company quote page and returns the contents of the table cell
# following "Industry:", or '' when the request fails or the cell is absent.
sub fetch_industry {
    my ($ua, $url) = @_;

    my $response = $ua->get($url);
    return '' unless $response->is_success;

    my $html = $response->content();
    return $html =~ m{Industry:</td>.*?>(.*?)</td>}is ? $1 : '';
}

# open_for_write($path)
# Opens $path for writing (truncating) and returns the handle; dies with the
# OS error on failure.
sub open_for_write {
    my ($path) = @_;

    open my $fh, '>', $path
        or die "Cannot open '$path' for writing: $!";
    return $fh;
}

# Initialize Mechanize
my $mech = WWW::Mechanize->new();

# Initialize Wikipedia
my $mw = MediaWiki::API->new();
$mw->{config}->{api_url} = 'http://en.wikipedia.org/w/api.php';

# Initialize LWP
my $browser = LWP::UserAgent->new();

# What is the date?
my $query_date = strftime '%B %e, %Y', localtime;

# Create number page
my $num_fh = open_for_write('wiki_numbers.txt');
print {$num_fh} $PAGE_BANNER . "==0-9==\n" . $PLAIN_TABLE_HEAD;

# Cycle through all the directory pages
for my $key (@index_keys) {

    # NOTE(review): the combined "0-9" page is assumed to be meant for the
    # digit directory pages only (its heading is "==0-9=="); previously every
    # company, including A-Z, was appended to it. Confirm this intent.
    my $is_digit = $key =~ m{\A[0-9]\z};

    # Create the URL of the directory page
    ( my $page_url = $dir_url ) =~ s/\Q$PLACEHOLDER\E/$key/g;

    # Create the letter pages
    my $plain_fh = open_for_write("wiki$key.txt");
    print {$plain_fh} $PAGE_BANNER . "==$key==\n" . $PLAIN_TABLE_HEAD;

    my $table_fh = open_for_write("wikialt$key.txt");
    print {$table_fh} $PAGE_BANNER . "==$key==\n" . $WIKI_TABLE_HEAD;

    # Handles receiving the two-column rows for this directory page.
    my @plain_targets = ( $plain_fh, ( $is_digit ? $num_fh : () ) );

    # Grab the directory page, then keep following "Next >" until it is gone.
    $mech->get($page_url);
    do {
        LINK:
        for my $link ( $mech->links() ) {
            next LINK unless $link->url() =~ m{company\.php};

            # Grab company name from the link text; a link without text
            # carries no name to list.
            my $company = $link->text();
            next LINK unless defined $company && length $company;

            # Parse the ticker from the URL before writing anything, so a
            # malformed link never produces a half-written row or reuses the
            # previous company's ticker.
            my $ticker = ticker_from_url( $link->url() );
            if ( !defined $ticker || !length $ticker ) {
                warn "No ticker found in link for '$company'; skipping\n";
                next LINK;
            }

            my $name_cell = company_cell( $mw, $company );

            for my $fh (@plain_targets) {
                print {$fh} "|$name_cell\n|{{tsx2|$ticker}}\n|----\n";
            }

            # Get the industry from the company's quote page
            ( my $detail_url = $company_url ) =~ s/\Q$PLACEHOLDER\E/$ticker/g;
            my $industry = fetch_industry( $browser, $detail_url );

            print "$company ($ticker), industry: $industry \n";
            print {$table_fh} "|-\n|$name_cell || {{tsx2|$ticker}} || $industry \n";
        }
    } while ( $mech->find_link( text => 'Next >' )
        && $mech->follow_link( text => 'Next >' ) );

    # Close both per-letter pages with this page's footer. (The wikitable
    # variant was previously never closed: the footer went to an unopened
    # handle.)
    my $footer = build_footer( $key, $query_date );
    for my $fh ( $plain_fh, $table_fh ) {
        print {$fh} $footer;
        close $fh or die "Error closing output for '$key': $!";
    }
}

print {$num_fh} build_footer( '0-9', $query_date );
close $num_fh or die "Error closing 'wiki_numbers.txt': $!";