285 lines
7.7 KiB
Perl
285 lines
7.7 KiB
Perl
package HTML::TokeParser;
|
|
|
|
# $Id: TokeParser.pm,v 2.5 1999/06/09 10:20:02 gisle Exp $
|
|
|
|
require HTML::Parser;
|
|
@ISA=qw(HTML::Parser);
|
|
$VERSION = sprintf("%d.%02d", q$Revision: 2.5 $ =~ /(\d+)\.(\d+)/);
|
|
|
|
use strict;
|
|
use Carp qw(croak);
|
|
use HTML::Entities qw(decode_entities);
|
|
|
|
|
|
sub new
|
|
{
|
|
my $class = shift;
|
|
my $file = shift;
|
|
croak "Usage: $class->new(\$file)" unless defined $file;
|
|
if (!ref($file) && ref(\$file) ne "GLOB") {
|
|
require IO::File;
|
|
$file = IO::File->new($file, "r") || return;
|
|
}
|
|
my $self = $class->SUPER::new;
|
|
$self->{tokens} = [];
|
|
$self->{textify} = {img => "alt", applet => "alt"};
|
|
if (ref($file) eq "SCALAR") {
|
|
$self->parse($$file);
|
|
$self->eof;
|
|
} else {
|
|
$self->{file} = $file;
|
|
}
|
|
$self;
|
|
}
|
|
|
|
# Set up callback methods
|
|
for (qw(declaration start end text comment)) {
|
|
my $t = uc(substr($_,0,1));
|
|
no strict 'refs';
|
|
*$_ = sub { my $self = shift; push(@{$self->{tokens}}, [$t, @_]) };
|
|
}
|
|
|
|
|
|
sub get_token
|
|
{
|
|
my $self = shift;
|
|
while (!@{$self->{tokens}} && $self->{file}) {
|
|
# must try to parse more of the file
|
|
my $buf;
|
|
if (read($self->{file}, $buf, 512)) {
|
|
$self->parse($buf);
|
|
} else {
|
|
$self->eof;
|
|
delete $self->{file};
|
|
}
|
|
}
|
|
shift @{$self->{tokens}};
|
|
}
|
|
|
|
|
|
sub unget_token
|
|
{
|
|
my $self = shift;
|
|
unshift @{$self->{tokens}}, @_;
|
|
$self;
|
|
}
|
|
|
|
|
|
sub get_tag
|
|
{
|
|
my $self = shift;
|
|
my $wanted = shift;
|
|
my $token;
|
|
GET_TOKEN:
|
|
{
|
|
$token = $self->get_token;
|
|
if ($token) {
|
|
my $type = shift @$token;
|
|
redo GET_TOKEN if $type !~ /^[SE]$/;
|
|
substr($token->[0], 0, 0) = "/" if $type eq "E";
|
|
redo GET_TOKEN if defined($wanted) && $token->[0] ne $wanted;
|
|
}
|
|
}
|
|
$token;
|
|
}
|
|
|
|
|
|
sub get_text
|
|
{
|
|
my $self = shift;
|
|
my $endat = shift;
|
|
my @text;
|
|
while (my $token = $self->get_token) {
|
|
my $type = $token->[0];
|
|
if ($type eq "T") {
|
|
push(@text, decode_entities($token->[1]));
|
|
} elsif ($type =~ /^[SE]$/) {
|
|
my $tag = $token->[1];
|
|
if ($type eq "S") {
|
|
if (exists $self->{textify}{$tag}) {
|
|
my $alt = $self->{textify}{$tag};
|
|
my $text;
|
|
if (ref($alt)) {
|
|
$text = &$alt(@$token);
|
|
} else {
|
|
$text = $token->[2]{$alt || "alt"};
|
|
$text = "[\U$tag]" unless defined $text;
|
|
}
|
|
push(@text, $text);
|
|
next;
|
|
}
|
|
} else {
|
|
$tag = "/$tag";
|
|
}
|
|
if (!defined($endat) || $endat eq $tag) {
|
|
$self->unget_token($token);
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
join("", @text);
|
|
}
|
|
|
|
|
|
sub get_trimmed_text
|
|
{
|
|
my $self = shift;
|
|
my $text = $self->get_text(@_);
|
|
$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
|
$text;
|
|
}
|
|
|
|
1;
|
|
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
HTML::TokeParser - Alternative HTML::Parser interface
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
require HTML::TokeParser;
|
|
$p = HTML::TokeParser->new("index.html") || die "Can't open: $!";
|
|
while (my $token = $p->get_token) {
|
|
#...
|
|
}
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
The HTML::TokeParser is an alternative interface to the HTML::Parser class.
|
|
It basically turns the HTML::Parser inside out. You associate a file
|
|
(or any IO::Handle object or string) with the parser at construction time and
|
|
then repeatedly call $parser->get_token to obtain the tags and text
|
|
found in the parsed document. No need to make a subclass to make the
|
|
parser do anything.
|
|
|
|
Calling the methods defined by the HTML::Parser base class will be
|
|
confusing, so don't do that. Use the following methods instead:
|
|
|
|
=over 4
|
|
|
|
=item $p = HTML::TokeParser->new( $file_or_doc );
|
|
|
|
The object constructor argument is either a file name, a file handle
|
|
object, or the complete document to be parsed.
|
|
|
|
If the argument is a plain scalar, then it is taken as the name of a
|
|
file to be opened and parsed. If the file can't be opened for
|
|
reading, then the constructor will return an undefined value and $!
|
|
will tell you why it failed.
|
|
|
|
If the argument is a reference to a plain scalar, then this scalar is
|
|
taken to be the document to parse.
|
|
|
|
Otherwise the argument is taken to be some object that the
|
|
C<HTML::TokeParser> can read() from when it need more data. Typically
|
|
it will be a filehandle of some kind.The stream will be read() until
|
|
EOF, but not closed.
|
|
|
|
=item $p->get_token
|
|
|
|
This method will return the next I<token> found in the HTML document,
|
|
or C<undef> at the end of the document. The token is returned as an
|
|
array reference. The first element of the array will be a single
|
|
character string denoting the type of this token; "S" for start tag,
|
|
"E" for end tag, "T" for text, "C" for comment, and "D" for
|
|
declaration. The rest of the array is the same as the arguments
|
|
passed to the corresponding HTML::Parser callbacks (see
|
|
L<HTML::Parser>). This summarize the tokens that can occur:
|
|
|
|
["S", $tag, %$attr, @$attrseq, $origtext]
|
|
["E", $tag, $origtext]
|
|
["T", $text]
|
|
["C", $text]
|
|
["D", $text]
|
|
|
|
=item $p->unget_token($token,...)
|
|
|
|
If you find out you have read too many tokens you can push them back,
|
|
so that they are returned the next time $p->get_token is called.
|
|
|
|
=item $p->get_tag( [$tag] )
|
|
|
|
This method return the next start or end tag (skipping any other
|
|
tokens), or C<undef> if there is no more tags in the document. If an
|
|
argument is given, then we skip tokens until the specified tag is
|
|
found. A tag is returned as an array reference of the same form as
|
|
for $p->get_token above, but the type code (first element) is missing
|
|
and the name of end tags are prefixed with "/". This means that the
|
|
tags returned look like this:
|
|
|
|
[$tag, %$attr, @$attrseq, $origtext]
|
|
["/$tag", $origtext]
|
|
|
|
=item $p->get_text( [$endtag] )
|
|
|
|
This method returns all text found at the current position. It will
|
|
return a zero length string if the next token is not text. The
|
|
optional $endtag argument specify that any text occurring before the
|
|
given tag is to be returned. Any entities will be expanded to their
|
|
corresponding character.
|
|
|
|
The $p->{textify} attribute is a hash that define how certain tags can
|
|
be treated as text. If the name of a start tag match a key in this
|
|
hash then this tag is converted to text. The hash value is used to
|
|
specify which tag attribute to obtain the text from. If this tag
|
|
attribute is missing, then the upper case name of the tag enclosed in
|
|
brackets is returned, e.g. "[IMG]". The hash value can also be a
|
|
subroutine reference. In this case the routine is called with the
|
|
start tag token content as arguments and the return values is treated
|
|
as the text.
|
|
|
|
The default $p->{textify} value is:
|
|
|
|
{img => "alt", applet => "alt"}
|
|
|
|
This means that <IMG> and <APPLET> tags are treated as text, and that
|
|
the text to substitute can be found as ALT attribute.
|
|
|
|
=item $p->get_trimmed_text( [$endtag] )
|
|
|
|
Same as $p->get_text above, but will collapse any sequence of white
|
|
space to a single space character. Leading and trailing space is
|
|
removed.
|
|
|
|
=back
|
|
|
|
=head1 EXAMPLES
|
|
|
|
This example extract all links from a document. It will print one
|
|
line for each link, containing the URL and the textual description
|
|
between the <A>...</A> tags:
|
|
|
|
use HTML::TokeParser;
|
|
$p = HTML::TokeParser->new(shift||"index.html");
|
|
|
|
while (my $token = $p->get_tag("a")) {
|
|
my $url = $token->[1]{href} || "-";
|
|
my $text = $p->get_trimmed_text("/a");
|
|
print "$url\t$text\n";
|
|
}
|
|
|
|
This example extract the <TITLE> from the document:
|
|
|
|
use HTML::TokeParser;
|
|
$p = HTML::TokeParser->new(shift||"index.html");
|
|
if ($p->get_tag("title")) {
|
|
my $title = $p->get_trimmed_text;
|
|
print "Title: $title\n";
|
|
}
|
|
|
|
=head1 SEE ALSO
|
|
|
|
L<HTML::Parser>
|
|
|
|
=head1 COPYRIGHT
|
|
|
|
Copyright 1998-1999 Gisle Aas.
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself.
|
|
|
|
=cut
|