[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [system-traders] Semi-OT: Symbol lists



Hi,
 
> Alternately, does someone know of a script to download and parse all
> the symbols from Yahoo?

The script is attached. Usage: perl build_market_list.pl <us|de|fr|..>

CU, Olf
-- 
Visit my world: http://www.olfsworld.de


#!/usr/bin/perl

# build_market_list.pl -- author: Oliver Bossert
# Copyright (C) 2004 Oliver Bossert
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

use strict;
use warnings;

use LWP::Simple;
use HTML::TableExtract;
use HTML::TreeBuilder;

# Main script: crawl the Yahoo "cpi" index pages of one market and print
# every stock found as a tab-separated line, grouped by source page.
#
# Usage: perl build_market_list.pl <us|de|fr|..>

# Market code taken from the command line; it becomes part of the URL path.
my $market = shift;
die "Usage: perl build_market_list.pl <us|de|fr|..>\n"
  unless defined $market && length $market;

# Host prepended to the hrefs collected by get_subpages().
my $BASE = "http://de.biz.yahoo.com";
my $url = $BASE . "/p/" . $market . "/cpi/index.html";

# First level: pages linked from the market index page.
my @subpages = get_subpages( [ $url ] );
#print join("\n", @subpages) . "\n";

# Second level: pages linked from each of the first-level pages.
my @subsubpages = get_subpages( \@subpages );
#print join("\n", @subsubpages) . "\n";
push @subpages, @subsubpages;

foreach my $page ( @subpages ) {
  print "## " . $page . "\n";
  foreach my $s ( get_stocks( $page ) ) {
    # A row may hold undef fields (e.g. no Yahoo symbol was found for the
    # code); print those as empty columns instead of triggering warnings.
    print join("\t", map { defined $_ ? $_ : '' } @$s) . "\n";
  }
}


# get_stocks( $url )
#
# Downloads the page at $url and extracts the rows of its stock table.
#
# Returns a list of array references, one per table row after the header
# row, laid out as
#   [ $market, $name, $code, $yahoo_symbol, @remaining_columns ]
# where $yahoo_symbol is undef when no "q?s=<symbol>&" link was found for
# that code. Returns an empty list (after a warning) if the page cannot be
# downloaded.
sub get_stocks {
  my $url = shift;
  my $html_string = get( $url );

  # LWP::Simple::get returns undef on failure; skip the page instead of
  # running the regex and the parser on an undefined value.
  unless ( defined $html_string ) {
    warn "Could not fetch $url\n";
    return;
  }

  # Index of the table to extract. It is shifted by one when the page
  # contains a "<td>[</td>" cell -- presumably an extra navigation table
  # precedes the stock table on such pages (TODO confirm).
  my $count = 4;
  if ( $html_string =~ /<td>\[<\/td>/s ) {
    $count++;
  }

  # First pass keeps the cell HTML so the Yahoo symbol can be read from the
  # "q?s=<symbol>&" link in the third column, keyed by the second column.
  my %yahoo = ();
  my $teh = HTML::TableExtract->new( depth => 0, count => $count, keep_html => 1 );
  $teh->parse($html_string);
  foreach my $ts ($teh->table_states) {
    foreach my $row ($ts->rows) {
      next unless defined($row->[1]) && defined($row->[2]);
      if ( $row->[2] =~ /q\?s=(.+?)&/ ) {
        $yahoo{$row->[1]} = $1;
      }
    }
  }

  # Second pass extracts the plain cell text of the same table.
  my @res = ();
  my $te = HTML::TableExtract->new( depth => 0, count => $count );
  $te->parse($html_string);
  foreach my $ts ($te->table_states) {
    my $first = 1;
    foreach my $row ($ts->rows) {
      # The first row of the table is the header line.
      if ( $first == 1 ) {
        $first = 0;
        next;
      }
      my $name = shift @$row;
      my $code = shift @$row;
      # Short rows can leave $code undefined; do not use undef as a hash key.
      my $symbol = defined $code ? $yahoo{$code} : undef;
      push @res, [$market, $name, $code, $symbol, @$row ];
    }
  }

  return @res;
}


# get_subpages( \@urls )
#
# Downloads every URL in the referenced array and collects the links whose
# href contains both "/cpi/" and "/<market>/".
#
# Returns the sorted, de-duplicated list of those links, each prefixed with
# $BASE. URLs that cannot be downloaded are skipped with a warning.
sub get_subpages {
  my $arref = shift;
  my %result = ();

  foreach my $url ( @$arref ) {
    my $content = get( $url );

    # LWP::Simple::get returns undef on failure.
    unless ( defined $content ) {
      warn "Could not fetch $url\n";
      next;
    }

    # Build a fresh tree per document: a tree that has already seen eof()
    # must not be fed another page, otherwise links of earlier pages stay
    # in the tree and the parser state is no longer well defined.
    my $tree = HTML::TreeBuilder->new;
    $tree->parse($content);
    $tree->eof();

    my @links = $tree->look_down(
      '_tag', 'a',
      sub {
        my $href = $_[0]->attr('href');
        return unless $href;
        # \Q..\E keeps regex metacharacters in the market code literal.
        return ( $href =~ /\/cpi\// && $href =~ /\/\Q$market\E\// );
      }
    );

    foreach my $link ( @links ) {
      $result{ $BASE . $link->attr('href') } = 1;
    }

    # Explicitly release the tree once its links have been collected.
    $tree->delete;
  }

  return sort keys %result;
}