442 lines
12 KiB
Perl
442 lines
12 KiB
Perl
package HTML::Parser;
|
|
|
|
# Author address: <gisle@aas.no>
|
|
|
|
use strict;
|
|
use HTML::Entities ();
|
|
|
|
use vars qw($VERSION);
|
|
$VERSION = "2.23"; # $Date: 1999/06/09 10:27:16 $
|
|
|
|
|
|
sub new
|
|
{
|
|
my $class = shift;
|
|
my $self = bless { '_buf' => '',
|
|
'_strict_comment' => 0,
|
|
}, $class;
|
|
$self;
|
|
}
|
|
|
|
|
|
# A little note about the observed Netscape behaviour:
|
|
#
|
|
# It parse <xmp> in the depreceated 'literal' mode, i.e. no tags are
|
|
# recognized until a </xmp> is found.
|
|
#
|
|
# <listing> is parsed like <pre>, i.e. tags are recognized. <listing>
|
|
# are presentend in smaller font than <pre>
|
|
#
|
|
# Netscape does not parse this comment correctly (it terminates the comment
|
|
# too early):
|
|
#
|
|
# <! -- comment -- --> more comment -->
|
|
#
|
|
# Netscape ignores '<!--' and '-->' within the <SCRIPT> and <STYLE> tag.
|
|
# This is used as a trick to make non-script-aware browsers ignore
|
|
# the scripts.
|
|
|
|
|
|
sub parse
|
|
{
|
|
my $self = shift;
|
|
my $buf = \ $self->{'_buf'};
|
|
unless (defined $_[0]) {
|
|
# signals EOF (assume rest is plain text)
|
|
$self->text($$buf) if length $$buf;
|
|
$$buf = '';
|
|
return $self;
|
|
}
|
|
$$buf .= $_[0];
|
|
my $netscape_comment = !$self->{'_strict_comment'};
|
|
|
|
# Parse html text in $$buf. The strategy is to remove complete
|
|
# tokens from the beginning of $$buf until we can't deside whether
|
|
# it is a token or not, or the $$buf is empty.
|
|
|
|
TOKEN:
|
|
while (1) {
|
|
|
|
# First we try to pull off any plain text (anything before a "<" char)
|
|
if ($$buf =~ s|^([^<]+)||) {
|
|
if (length $$buf) {
|
|
$self->text($1);
|
|
} else {
|
|
my $text = $1;
|
|
# At the end of the buffer, we should not parse white space
|
|
# but leave it for parsing on the next round.
|
|
if ($text =~ s|(\s+)$||) {
|
|
$$buf = $1;
|
|
# Same treatment for chopped up entites and words.
|
|
# We must wait until we have it all.
|
|
} elsif ($text =~ s|(\s*\S+)$||) {
|
|
$$buf = $1;
|
|
};
|
|
$self->text($text) if length $text;
|
|
last TOKEN;
|
|
}
|
|
|
|
# Netscapes buggy comments are easy to handle
|
|
} elsif ($netscape_comment && $$buf =~ m|^<!\s*--|) {
|
|
if ($$buf =~ s|^<!\s*--(.*?)--\s*>||s) {
|
|
$self->comment($1);
|
|
} else {
|
|
last TOKEN; # must wait until we see the end of it
|
|
}
|
|
|
|
# Then, markup declarations (usually either <!DOCTYPE...> or a comment)
|
|
} elsif ($$buf =~ s|^(<!)||) {
|
|
my $eaten = $1;
|
|
my $text = '';
|
|
my @com = (); # keeps comments until we have seen the end
|
|
# Eat text and beginning of comment
|
|
while ($$buf =~ s|^(([^>]*?)--)||) {
|
|
$eaten .= $1;
|
|
$text .= $2;
|
|
# Look for end of comment
|
|
if ($$buf =~ s|^((.*?)--)||s) {
|
|
$eaten .= $1;
|
|
push(@com, $2);
|
|
} else {
|
|
# Need more data to get all comment text.
|
|
$$buf = $eaten . $$buf;
|
|
last TOKEN;
|
|
}
|
|
}
|
|
# Can we finish the tag
|
|
if ($$buf =~ s|^([^>]*)>||) {
|
|
$text .= $1;
|
|
$self->declaration($text) if $text =~ /\S/;
|
|
# then tell about all the comments we found
|
|
for (@com) { $self->comment($_); }
|
|
} else {
|
|
$$buf = $eaten . $$buf; # must start with it all next time
|
|
last TOKEN;
|
|
}
|
|
|
|
# Should we look for 'processing instructions' <? ...> ??
|
|
#} elsif ($$buf =~ s|<\?||) {
|
|
# ...
|
|
|
|
# Then, look for a end tag
|
|
} elsif ($$buf =~ s|^</||) {
|
|
# end tag
|
|
if ($$buf =~ s|^([a-zA-Z][a-zA-Z0-9\.\-]*)(\s*>)||) {
|
|
$self->end(lc($1), "</$1$2");
|
|
} elsif ($$buf =~ m|^[a-zA-Z]*[a-zA-Z0-9\.\-]*\s*$|) {
|
|
$$buf = "</" . $$buf; # need more data to be sure
|
|
last TOKEN;
|
|
} else {
|
|
# it is plain text after all
|
|
$self->text("</");
|
|
}
|
|
|
|
# Then, finally we look for a start tag
|
|
} elsif ($$buf =~ s|^(<([a-zA-Z]+)>)||) {
|
|
# special case plain start tags for slight speed-up (2.5%)
|
|
$self->start(lc($2), {}, [], $1);
|
|
|
|
} elsif ($$buf =~ s|^<||) {
|
|
# start tag
|
|
my $eaten = '<';
|
|
|
|
# This first thing we must find is a tag name. RFC1866 says:
|
|
# A name consists of a letter followed by letters,
|
|
# digits, periods, or hyphens. The length of a name is
|
|
# limited to 72 characters by the `NAMELEN' parameter in
|
|
# the SGML declaration for HTML, 9.5, "SGML Declaration
|
|
# for HTML". In a start-tag, the element name must
|
|
# immediately follow the tag open delimiter `<'.
|
|
if ($$buf =~ s|^(([a-zA-Z][a-zA-Z0-9\.\-]*)\s*)||) {
|
|
$eaten .= $1;
|
|
my $tag = lc $2;
|
|
my %attr;
|
|
my @attrseq;
|
|
|
|
# Then we would like to find some attributes
|
|
#
|
|
# Arrgh!! Since stupid Netscape violates RCF1866 by
|
|
# using "_" in attribute names (like "ADD_DATE") of
|
|
# their bookmarks.html, we allow this too.
|
|
while ($$buf =~ s|^(([a-zA-Z][a-zA-Z0-9\.\-_]*)\s*)||) {
|
|
$eaten .= $1;
|
|
my $attr = lc $2;
|
|
my $val;
|
|
# The attribute might take an optional value (first we
|
|
# check for an unquoted value)
|
|
if ($$buf =~ s|(^=\s*([^\"\'>\s][^>\s]*)\s*)||) {
|
|
$eaten .= $1;
|
|
$val = $2;
|
|
HTML::Entities::decode($val);
|
|
# or quoted by " or '
|
|
} elsif ($$buf =~ s|(^=\s*([\"\'])(.*?)\2\s*)||s) {
|
|
$eaten .= $1;
|
|
$val = $3;
|
|
HTML::Entities::decode($val);
|
|
# truncated just after the '=' or inside the attribute
|
|
} elsif ($$buf =~ m|^(=\s*)$| or
|
|
$$buf =~ m|^(=\s*[\"\'].*)|s) {
|
|
$$buf = "$eaten$1";
|
|
last TOKEN;
|
|
} else {
|
|
# assume attribute with implicit value
|
|
$val = $attr;
|
|
}
|
|
$attr{$attr} = $val;
|
|
push(@attrseq, $attr);
|
|
}
|
|
|
|
# At the end there should be a closing ">"
|
|
if ($$buf =~ s|^>||) {
|
|
$self->start($tag, \%attr, \@attrseq, "$eaten>");
|
|
} elsif (length $$buf) {
|
|
# Not a conforming start tag, regard it as normal text
|
|
$self->text($eaten);
|
|
} else {
|
|
$$buf = $eaten; # need more data to know
|
|
last TOKEN;
|
|
}
|
|
|
|
} elsif (length $$buf) {
|
|
$self->text($eaten);
|
|
} else {
|
|
$$buf = $eaten . $$buf; # need more data to parse
|
|
last TOKEN;
|
|
}
|
|
|
|
} else {
|
|
#die if length($$buf); # This should never happen
|
|
last TOKEN; # The buffer should be empty now
|
|
}
|
|
}
|
|
|
|
$self;
|
|
}
|
|
|
|
|
|
sub eof
|
|
{
|
|
shift->parse(undef);
|
|
}
|
|
|
|
|
|
sub parse_file
|
|
{
|
|
my($self, $file) = @_;
|
|
no strict 'refs'; # so that a symbol ref as $file works
|
|
local(*F);
|
|
unless (ref($file) || $file =~ /^\*[\w:]+$/) {
|
|
# Assume $file is a filename
|
|
open(F, $file) || die "Can't open $file: $!";
|
|
$file = \*F;
|
|
}
|
|
my $chunk = '';
|
|
while(read($file, $chunk, 512)) {
|
|
$self->parse($chunk);
|
|
}
|
|
close($file);
|
|
$self->eof;
|
|
}
|
|
|
|
|
|
sub strict_comment
|
|
{
|
|
my $self = shift;
|
|
my $old = $self->{'_strict_comment'};
|
|
$self->{'_strict_comment'} = shift if @_;
|
|
return $old;
|
|
}
|
|
|
|
|
|
sub netscape_buggy_comment # legacy
|
|
{
|
|
my $self = shift;
|
|
my $old = !$self->strict_comment;
|
|
$self->strict_comment(!shift) if @_;
|
|
return $old;
|
|
}
|
|
|
|
|
|
sub text
|
|
{
|
|
# my($self, $text) = @_;
|
|
}
|
|
|
|
sub declaration
|
|
{
|
|
# my($self, $decl) = @_;
|
|
}
|
|
|
|
sub comment
|
|
{
|
|
# my($self, $comment) = @_;
|
|
}
|
|
|
|
sub start
|
|
{
|
|
# my($self, $tag, $attr, $attrseq, $origtext) = @_;
|
|
# $attr is reference to a HASH, $attrseq is reference to an ARRAY
|
|
}
|
|
|
|
sub end
|
|
{
|
|
# my($self, $tag, $origtext) = @_;
|
|
}
|
|
|
|
1;
|
|
|
|
|
|
__END__
|
|
|
|
|
|
=head1 NAME
|
|
|
|
HTML::Parser - SGML parser class
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
require HTML::Parser;
|
|
$p = HTML::Parser->new; # should really a be subclass
|
|
$p->parse($chunk1);
|
|
$p->parse($chunk2);
|
|
#...
|
|
$p->eof; # signal end of document
|
|
|
|
# Parse directly from file
|
|
$p->parse_file("foo.html");
|
|
# or
|
|
open(F, "foo.html") || die;
|
|
$p->parse_file(\*F);
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
The C<HTML::Parser> will tokenize an HTML document when the parse()
|
|
method is called by invoking various callback methods. The document to
|
|
be parsed can be supplied in arbitrary chunks.
|
|
|
|
The external interface the an I<HTML::Parser> is:
|
|
|
|
=over 4
|
|
|
|
=item $p = HTML::Parser->new
|
|
|
|
The object constructor takes no arguments.
|
|
|
|
=item $p->parse( $string );
|
|
|
|
Parse the $string as an HTML document. Can be called multiple times.
|
|
The return value is a reference to the parser object.
|
|
|
|
=item $p->eof
|
|
|
|
Signals end of document. Call eof() to flush any remaining buffered
|
|
text. The return value is a reference to the parser object.
|
|
|
|
=item $p->parse_file( $file );
|
|
|
|
This method can be called to parse text from a file. The argument can
|
|
be a filename or an already opened file handle. The return value from
|
|
parse_file() is a reference to the parser object.
|
|
|
|
=item $p->strict_comment( [$bool] )
|
|
|
|
By default we parse comments similar to how the popular browsers (like
|
|
Netscape and MSIE) do it. This means that comments will always be
|
|
terminated by the first occurrence of "-->". This is not correct
|
|
according to the "official" HTML standards. The official behaviour
|
|
can be enabled by calling the strict_comment() method with a TRUE
|
|
argument.
|
|
|
|
The return value from strict_comment() is the old attribute value.
|
|
|
|
=back
|
|
|
|
|
|
|
|
In order to make the parser do anything interesting, you must make a
|
|
subclass where you override one or more of the following methods as
|
|
appropriate:
|
|
|
|
=over 4
|
|
|
|
=item $self->declaration($decl)
|
|
|
|
This method is called when a I<markup declaration> has been
|
|
recognized. For typical HTML documents, the only declaration you are
|
|
likely to find is <!DOCTYPE ...>. The initial "<!" and ending ">" is
|
|
not part of the string passed as argument. Comments are removed and
|
|
entities will B<not> be expanded.
|
|
|
|
=item $self->start($tag, $attr, $attrseq, $origtext)
|
|
|
|
This method is called when a complete start tag has been recognized.
|
|
The first argument is the tag name (in lower case) and the second
|
|
argument is a reference to a hash that contain all attributes found
|
|
within the start tag. The attribute keys are converted to lower case.
|
|
Entities found in the attribute values are already expanded. The
|
|
third argument is a reference to an array with the lower case
|
|
attribute keys in the original order. The fourth argument is the
|
|
original HTML text.
|
|
|
|
|
|
=item $self->end($tag, $origtext)
|
|
|
|
This method is called when an end tag has been recognized. The
|
|
first argument is the lower case tag name, the second the original
|
|
HTML text of the tag.
|
|
|
|
=item $self->text($text)
|
|
|
|
This method is called when plain text in the document is recognized.
|
|
The text is passed on unmodified and might contain multiple lines.
|
|
Note that for efficiency reasons entities in the text are B<not>
|
|
expanded. You should call HTML::Entities::decode($text) before you
|
|
process the text any further.
|
|
|
|
A sequence of text in the HTML document can be broken between several
|
|
invocations of $self->text. The parser will make sure that it does
|
|
not break a word or a sequence of spaces between two invocations of
|
|
$self->text().
|
|
|
|
=item $self->comment($comment)
|
|
|
|
This method is called as comments are recognized. The leading and
|
|
trailing "--" sequences have been stripped off the comment text.
|
|
|
|
=back
|
|
|
|
The default implementation of these methods do nothing, i.e., the
|
|
tokens are just ignored.
|
|
|
|
There is really nothing in the basic parser that is HTML specific, so
|
|
it is likely that the parser can parse other kinds of SGML documents.
|
|
SGML has many obscure features (not implemented by this module) that
|
|
prevent us from renaming this module as C<SGML::Parser>.
|
|
|
|
=head1 EFFICIENCY
|
|
|
|
The parser is fairly inefficient if the chunks passed to $p->parse()
|
|
are too big. The reason is probably that perl ends up with a lot of
|
|
character copying when tokens are removed from the beginning of the
|
|
strings. A chunk size of about 256-512 bytes was optimal in a test I
|
|
made with some real world HTML documents. (The parser was about 3
|
|
times slower with a chunk size of 20K).
|
|
|
|
=head1 SEE ALSO
|
|
|
|
L<HTML::Entities>, L<HTML::TokeParser>, L<HTML::Filter>,
|
|
L<HTML::HeadParser>, L<HTML::LinkExtor>
|
|
|
|
L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution)
|
|
|
|
=head1 COPYRIGHT
|
|
|
|
Copyright 1996-1999 Gisle Aas. All rights reserved.
|
|
|
|
This library is free software; you can redistribute it and/or
|
|
modify it under the same terms as Perl itself.
|
|
|
|
=cut
|
|
|
|
|