# XML::Parser # # Copyright (c) 1998 Larry Wall and Clark Cooper # All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. package XML::Parser; use vars qw($VERSION %Built_In_Styles $have_LWP); use Carp; BEGIN { require XML::Parser::Expat; $VERSION = '2.27'; die "Parser.pm and Expat.pm versions don't match" unless $VERSION eq $XML::Parser::Expat::VERSION; eval { require 'LWP.pm'; require 'URI/URL.pm'; }; $have_LWP = not $@; if ($have_LWP) { import LWP; } } use strict; sub new { my ($class, %args) = @_; my $style = $args{Style}; my $nonexopt = $args{Non_Expat_Options} ||= {}; $nonexopt->{Style} = 1; $nonexopt->{Non_Expat_Options} = 1; $nonexopt->{Handlers} = 1; $nonexopt->{_HNDL_TYPES} = 1; $args{_HNDL_TYPES} = {%XML::Parser::Expat::Handler_Setters}; $args{_HNDL_TYPES}->{Init} = 1; $args{_HNDL_TYPES}->{Final} = 1; $args{Handlers} ||= {}; my $handlers = $args{Handlers}; if (defined($style)) { my $stylepkg = $style; if ($stylepkg !~ /::/) { $stylepkg = "\u$style"; # I'm using the Built_In_Styles hash to define # valid internal styles, since a style doesn't # need to define any particular Handler to be valid. # So I can't check for the existence of a particular sub. croak "Undefined style: $style" unless defined($Built_In_Styles{$stylepkg}); $stylepkg = 'XML::Parser::' . $stylepkg; } my $htype; foreach $htype (keys %{$args{_HNDL_TYPES}}) { # Handlers explicity given override # handlers from the Style package unless (defined($handlers->{$htype})) { # A handler in the style package must either have # exactly the right case as the type name or a # completely lower case version of it. my $hname = "${stylepkg}::$htype"; if (defined(&$hname)) { $handlers->{$htype} = \&$hname; next; } $hname = "${stylepkg}::\L$htype"; if (defined(&$hname)) { $handlers->{$htype} = \&$hname; next; } } } } $handlers->{ExternEnt} ||= ($have_LWP ? \&lwp_ext_ent_handler : \&file_ext_ent_handler); $args{Pkg} ||= caller; bless \%args, $class; } # End of new sub setHandlers { my ($self, @handler_pairs) = @_; croak("Uneven number of arguments to setHandlers method") if (int(@handler_pairs) & 1); my @ret; while (@handler_pairs) { my $type = shift @handler_pairs; my $handler = shift @handler_pairs; unless (defined($self->{_HNDL_TYPES}->{$type})) { my @types = sort keys %{$self->{_HNDL_TYPES}}; croak("Unknown Parser handler type: $type\n Valid types: @types"); } push(@ret, $type, $self->{Handlers}->{$type}); $self->{Handlers}->{$type} = $handler; } return @ret; } # End of setHandlers sub parse_start { my $self = shift; my @expat_options = (); my ($key, $val); while (($key, $val) = each %{$self}) { push (@expat_options, $key, $val) unless exists $self->{Non_Expat_Options}->{$key}; } my %handlers = %{$self->{Handlers}}; my $init = delete $handlers{Init}; my $final = delete $handlers{Final}; my $expatnb = new XML::Parser::ExpatNB(@expat_options, @_); $expatnb->setHandlers(%handlers); &$init($expatnb) if defined($init); $expatnb->{FinalHandler} = $final if defined($final); return $expatnb; } sub parse { my $self = shift; my $arg = shift; my @expat_options = (); my ($key, $val); while (($key, $val) = each %{$self}) { push(@expat_options, $key, $val) unless exists $self->{Non_Expat_Options}->{$key}; } my $expat = new XML::Parser::Expat(@expat_options, @_); my %handlers = %{$self->{Handlers}}; my $init = delete $handlers{Init}; my $final = delete $handlers{Final}; $expat->setHandlers(%handlers); &$init($expat) if defined($init); my @result = (); my $result = $expat->parse($arg); if ($result and defined($final)) { if (wantarray) { @result = &$final($expat); } else { $result = &$final($expat); } } $expat->release; return unless defined wantarray; return wantarray ? @result : $result; } # End of parse sub parsestring { my $self = shift; $self->parse(@_); } # End of parsestring sub parsefile { my $self = shift; my $file = shift; local(*FILE); open(FILE, $file) or croak "Couldn't open $file:\n$!"; binmode(FILE); my @ret; my $ret; if (wantarray) { eval { @ret = $self->parse(*FILE, @_); }; } else { eval { $ret = $self->parse(*FILE, @_); }; } my $err = $@; close(FILE); die $err if $err; return unless defined wantarray; return wantarray ? @ret : $ret; } # End of parsefile my %External_Entity_Table = (); sub file_ext_ent_handler { my ($exp, $base, $sys) = @_; # We don't use pub id local(*ENT); my $name = $sys; # Prepend base only for relative URLs if (defined($base) and not ($name =~ m!^(?:/|\w+:)!)) { $name = $base . $sys; } if ($name =~ s/^(\w+)://) { my $method = $1; unless ($method eq 'file') { $exp->{ErrorMessage} .= "\nDefault external entity handler only deals with file URLs."; return undef; } } if ($name =~ /^[|>+]/ or $name =~ /\|$/) { $exp->{ErrorMessage} .= "Perl IO controls not permitted in system id"; return undef; } my $def = $External_Entity_Table{$name}; if (! defined($def)) { unless (open(ENT, $name)) { $exp->{ErrorMessage} .= "Failed to open $name: $!"; return undef; } my $status; $status = read(ENT, $def, -s $name); close(ENT); unless (defined($status)) { $exp->{ErrorMessage} .= "Error reading $name: $!"; return undef; } $External_Entity_Table{$name} = $def; } return $def; } # End file_ext_ent_handler ## ## Note that this external entity handler reads the entire entity into ## memory, so it will choke on huge ones. ## sub lwp_ext_ent_handler { my ($exp, $base, $sys) = @_; # We don't use public id my $uri = new URI::URL($sys); if (not $uri->scheme and $base) { $uri = $uri->abs($base); } my $scheme = $uri->scheme; if (not $scheme or $scheme eq 'file') { return file_ext_ent_handler($exp, $base, $sys); } my $ua = ($exp->{_lwpagent} ||= new LWP::UserAgent()); my $req = new HTTP::Request('GET', $uri); my $res = $ua->request($req); if ($res->is_error) { $exp->{ErrorMessage} .= "\n" . $res->status_line; return undef; } return $res->content; } # End lwp_ext_ent_handler ################################################################### package XML::Parser::Debug; $XML::Parser::Built_In_Styles{Debug} = 1; sub Start { my $expat = shift; my $tag = shift; print STDERR "@{$expat->{Context}} \\\\ (@_)\n"; } sub End { my $expat = shift; my $tag = shift; print STDERR "@{$expat->{Context}} //\n"; } sub Char { my $expat = shift; my $text = shift; $text =~ s/([\x80-\xff])/sprintf "#x%X;", ord $1/eg; $text =~ s/([\t\n])/sprintf "#%d;", ord $1/eg; print STDERR "@{$expat->{Context}} || $text\n"; } sub Proc { my $expat = shift; my $target = shift; my $text = shift; print $expat,"\n"; print $expat->{Context}, "\n"; my @foo = @{$expat->{Context}}; print STDERR "@foo $target($text)\n"; } ################################################################### package XML::Parser::Subs; $XML::Parser::Built_In_Styles{Subs} = 1; sub Start { no strict 'refs'; my $expat = shift; my $tag = shift; my $sub = $expat->{Pkg} . "::$tag"; eval { &$sub($expat, $tag, @_) }; } sub End { no strict 'refs'; my $expat = shift; my $tag = shift; my $sub = $expat->{Pkg} . "::${tag}_"; eval { &$sub($expat, $tag) }; } ################################################################### package XML::Parser::Tree; $XML::Parser::Built_In_Styles{Tree} = 1; sub Init { my $expat = shift; $expat->{Lists} = []; $expat->{Curlist} = $expat->{Tree} = []; } sub Start { my $expat = shift; my $tag = shift; my $newlist = [ { @_ } ]; push @{ $expat->{Lists} }, $expat->{Curlist}; push @{ $expat->{Curlist} }, $tag => $newlist; $expat->{Curlist} = $newlist; } sub End { my $expat = shift; my $tag = shift; $expat->{Curlist} = pop @{ $expat->{Lists} }; } sub Char { my $expat = shift; my $text = shift; my $clist = $expat->{Curlist}; my $pos = $#$clist; if ($pos > 0 and $clist->[$pos - 1] eq '0') { $clist->[$pos] .= $text; } else { push @$clist, 0 => $text; } } sub Final { my $expat = shift; delete $expat->{Curlist}; delete $expat->{Lists}; $expat->{Tree}; } ################################################################### package XML::Parser::Objects; $XML::Parser::Built_In_Styles{Objects} = 1; sub Init { my $expat = shift; $expat->{Lists} = []; $expat->{Curlist} = $expat->{Tree} = []; } sub Start { my $expat = shift; my $tag = shift; my $newlist = [ ]; my $class = "${$expat}{Pkg}::$tag"; my $newobj = bless { @_, Kids => $newlist }, $class; push @{ $expat->{Lists} }, $expat->{Curlist}; push @{ $expat->{Curlist} }, $newobj; $expat->{Curlist} = $newlist; } sub End { my $expat = shift; my $tag = shift; $expat->{Curlist} = pop @{ $expat->{Lists} }; } sub Char { my $expat = shift; my $text = shift; my $class = "${$expat}{Pkg}::Characters"; my $clist = $expat->{Curlist}; my $pos = $#$clist; if ($pos >= 0 and ref($clist->[$pos]) eq $class) { $clist->[$pos]->{Text} .= $text; } else { push @$clist, bless { Text => $text }, $class; } } sub Final { my $expat = shift; delete $expat->{Curlist}; delete $expat->{Lists}; $expat->{Tree}; } ################################################################ package XML::Parser::Stream; $XML::Parser::Built_In_Styles{Stream} = 1; # This style invented by Tim Bray sub Init { no strict 'refs'; my $expat = shift; $expat->{Text} = ''; my $sub = $expat->{Pkg} ."::StartDocument"; &$sub($expat) if defined(&$sub); } sub Start { no strict 'refs'; my $expat = shift; my $type = shift; doText($expat); $_ = "<$type"; %_ = @_; while (@_) { $_ .= ' ' . shift() . '="' . shift() . '"'; } $_ .= '>'; my $sub = $expat->{Pkg} . "::StartTag"; if (defined(&$sub)) { &$sub($expat, $type); } else { print; } } sub End { no strict 'refs'; my $expat = shift; my $type = shift; # Set right context for Text handler push(@{$expat->{Context}}, $type); doText($expat); pop(@{$expat->{Context}}); $_ = ""; my $sub = $expat->{Pkg} . "::EndTag"; if (defined(&$sub)) { &$sub($expat, $type); } else { print; } } sub Char { my $expat = shift; $expat->{Text} .= shift; } sub Proc { no strict 'refs'; my $expat = shift; my $target = shift; my $text = shift; $_ = ""; my $sub = $expat->{Pkg} . "::PI"; if (defined(&$sub)) { &$sub($expat, $target, $text); } else { print; } } sub Final { no strict 'refs'; my $expat = shift; my $sub = $expat->{Pkg} . "::EndDocument"; &$sub($expat) if defined(&$sub); } sub doText { no strict 'refs'; my $expat = shift; $_ = $expat->{Text}; if (length($_)) { my $sub = $expat->{Pkg} . "::Text"; if (defined(&$sub)) { &$sub($expat); } else { print; } $expat->{Text} = ''; } } 1; __END__ =head1 NAME XML::Parser - A perl module for parsing XML documents =head1 SYNOPSIS use XML::Parser; $p1 = new XML::Parser(Style => 'Debug'); $p1->parsefile('REC-xml-19980210.xml'); $p1->parse('Hello World'); # Alternative $p2 = new XML::Parser(Handlers => {Start => \&handle_start, End => \&handle_end, Char => \&handle_char}); $p2->parse($socket); # Another alternative $p3 = new XML::Parser(ErrorContext => 2); $p3->setHandlers(Char => \&text, Default => \&other); open(FOO, 'xmlgenerator |'); $p3->parse(*FOO, ProtocolEncoding => 'ISO-8859-1'); close(FOO); $p3->parsefile('junk.xml', ErrorContext => 3); =begin man .ds PI PI =end man =head1 DESCRIPTION This module provides ways to parse XML documents. It is built on top of L, which is a lower level interface to James Clark's expat library. Each call to one of the parsing methods creates a new instance of XML::Parser::Expat which is then used to parse the document. Expat options may be provided when the XML::Parser object is created. These options are then passed on to the Expat object on each parse call. They can also be given as extra arguments to the parse methods, in which case they override options given at XML::Parser creation time. The behavior of the parser is controlled either by C> and/or C> options, or by L method. These all provide mechanisms for XML::Parser to set the handlers needed by XML::Parser::Expat. If neither C