[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [system-traders] Semi-OT: Symbol lists
Hi,
> Alternately, does someone know of a script to download and parse all
> the symbols from Yahoo?
The script is attached. Usage: perl build_market_list.pl <us|de|fr|..>
CU, Olf
--
Visit my world: http://www.olfsworld.de
#!/usr/bin/perl
# build_market_list.pl -- author: Oliver Bossert
# Copyright (C) 2004 Oliver Bossert
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
use strict;
use warnings;
use LWP::Simple;
use HTML::TableExtract;
use HTML::TreeBuilder;
# Market code (e.g. "us", "de", "fr") taken from the first command-line
# argument; it is used both to build the start URL and to filter links.
my $market = shift;
die "Usage: perl $0 <us|de|fr|..>\n"
    unless defined $market && length $market;

my $BASE = "http://de.biz.yahoo.com";
my $url  = $BASE . "/p/" . $market . "/cpi/index.html";

# First level: pages linked from the market's index page.
my @subpages = get_subpages( [ $url ] );
#print join("\n", @subpages) . "\n";

# Second level: pages linked from the first-level pages.
my @subsubpages = get_subpages( \@subpages );
#print join("\n", @subsubpages) . "\n";

# Merge both levels, keeping the first occurrence of each URL so that a
# page reachable from both levels is fetched and printed only once.
my %seen_page = ();
@subpages = grep { !$seen_page{$_}++ } ( @subpages, @subsubpages );

foreach my $page (@subpages) {
    print "## " . $page . "\n";
    foreach my $s ( get_stocks($page) ) {

        # Cells may be undefined (e.g. no Yahoo symbol was found); print
        # them as empty fields instead of triggering a warning.
        print join( "\t", map { defined $_ ? $_ : '' } @$s ) . "\n";
    }
}
# get_stocks( $url )
#
# Downloads $url and extracts the stock rows from one table of the page.
#
# Returns a list of array references, one per data row, laid out as
#   [ $market, $name, $code, $yahoo_symbol, @remaining_cells ]
# where $market is the file-level market code and $yahoo_symbol is undef
# when no symbol link was found for the row.  Returns an empty list (and
# warns) when the page cannot be downloaded.
sub get_stocks {
    my $url = shift;

    my $html_string = get($url);
    unless ( defined $html_string ) {
        warn "Could not fetch $url\n";
        return;
    }

    # The wanted table is selected by its position at depth 0.  Pages that
    # contain a cell holding only "[" have it one position further down
    # (presumably because of an extra table on those pages -- unverified).
    my $count = 4;
    if ( $html_string =~ m{<td>\[</td>}s ) {
        $count++;
    }

    # Pass 1: keep the cell HTML so the link in the third column can be
    # searched for the Yahoo symbol ("q?s=SYMBOL&").
    # NOTE(review): the map is keyed by the raw HTML of the second column
    # but looked up below by its plain text; these agree only if that cell
    # contains no markup -- confirm against a live page.
    my %yahoo = ();
    my $teh   = HTML::TableExtract->new(
        depth     => 0,
        count     => $count,
        keep_html => 1,
    );
    $teh->parse($html_string);
    foreach my $ts ( $teh->table_states ) {
        foreach my $row ( $ts->rows ) {
            next unless defined $row->[1] && defined $row->[2];
            if ( $row->[2] =~ m{q\?s=(.+?)&} ) {
                $yahoo{ $row->[1] } = $1;
            }
        }
    }

    # Pass 2: plain-text cells for the actual output rows.
    my @res = ();
    my $te  = HTML::TableExtract->new( depth => 0, count => $count );
    $te->parse($html_string);
    foreach my $ts ( $te->table_states ) {
        my @rows = $ts->rows;
        shift @rows;    # the first row is the table header

        foreach my $row (@rows) {
            my $name   = shift @$row;
            my $code   = shift @$row;
            my $symbol = defined $code ? $yahoo{$code} : undef;
            push @res, [ $market, $name, $code, $symbol, @$row ];
        }
    }

    return @res;
}
# get_subpages( \@urls )
#
# Downloads every URL in the given array reference and collects the
# links (<a href="...">) whose href contains both "/cpi/" and
# "/$market/" (the file-level market code).
#
# Returns the sorted list of unique link targets, each prefixed with
# $BASE.  URLs that cannot be downloaded are skipped with a warning.
sub get_subpages {
    my $arref = shift;

    my %result = ();
    foreach my $url (@$arref) {
        my $content = get($url);
        unless ( defined $content ) {
            warn "Could not fetch $url\n";
            next;
        }

        # A tree must not be fed another document after eof(), so build
        # a fresh one for every page and free it when done.
        my $tree = HTML::TreeBuilder->new;
        $tree->parse($content);
        $tree->eof();

        my @links = $tree->look_down(
            '_tag', 'a',
            sub {
                my $href = $_[0]->attr('href');
                return unless $href;

                # \Q..\E keeps regex metacharacters in the market code
                # from being interpreted as a pattern.
                return ( $href =~ m{/cpi/} && $href =~ m{/\Q$market\E/} );
            }
        );
        foreach my $link (@links) {
            $result{ $BASE . $link->attr('href') } = 1;
        }

        $tree->delete;
    }

    return sort keys %result;
}