250 lines
7.2 KiB
Perl
250 lines
7.2 KiB
Perl
package HTML::Entities;
|
||
|
||
# $Id: Entities.pm,v 1.13 1998/03/26 21:19:05 aas Exp $
|
||
|
||
=head1 NAME
|
||
|
||
HTML::Entities - Encode or decode strings with HTML entities
|
||
|
||
=head1 SYNOPSIS
|
||
|
||
use HTML::Entities;
|
||
|
||
$a = "Våre norske tegn bør æres";
|
||
decode_entities($a);
|
||
encode_entities($a, "\200-\377");
|
||
|
||
=head1 DESCRIPTION
|
||
|
||
This module deals with encoding and decoding of strings with HTML
|
||
character entites. The module provide the following functions:
|
||
|
||
=over 4
|
||
|
||
=item decode_entities($string)
|
||
|
||
This routine replaces HTML entities found in the $string with the
|
||
corresponding ISO-8859/1 character. Unrecognized entities are left
|
||
alone.
|
||
|
||
=item endode_entities($string, [$unsafe_chars])
|
||
|
||
This routine replaces unsafe characters in $string with their entity
|
||
representation. A second argument can be given to specify which
|
||
characters to concider as unsafe. The default set of characters to
|
||
expand are control chars, high-bit chars and the '<', '&', '>' and '"'
|
||
character.
|
||
|
||
=back
|
||
|
||
Both routines modify the string passed in as the first argument if
|
||
called in void context. In scalar and array context the encoded or
|
||
decoded string is returned (and the argument string is left
|
||
unchanged).
|
||
|
||
If you prefer not to import these routines into your namespace you can
|
||
call them as:
|
||
|
||
use HTML::Entities ();
|
||
$encoded = HTML::Entities::encode($a);
|
||
$decoded = HTML::Entities::decode($a);
|
||
|
||
The module can also export the %char2entity and the %entity2char
|
||
hashes which contains the mapping from all characters to the
|
||
corresponding entities.
|
||
|
||
=head1 COPYRIGHT
|
||
|
||
Copyright 1995-1998 Gisle Aas. All rights reserved.
|
||
|
||
This library is free software; you can redistribute it and/or
|
||
modify it under the same terms as Perl itself.
|
||
|
||
=cut
|
||
|
||
use strict;
|
||
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
|
||
use vars qw(%entity2char %char2entity);
|
||
|
||
require 5.004;
|
||
require Exporter;
|
||
@ISA = qw(Exporter);
|
||
|
||
@EXPORT = qw(encode_entities decode_entities);
|
||
@EXPORT_OK = qw(%entity2char %char2entity);
|
||
|
||
$VERSION = sprintf("%d.%02d", q$Revision: 1.13 $ =~ /(\d+)\.(\d+)/);
|
||
sub Version { $VERSION; }
|
||
|
||
|
||
%entity2char = (
|
||
# Some normal chars that have special meaning in SGML context
|
||
amp => '&', # ampersand
|
||
'gt' => '>', # greater than
|
||
'lt' => '<', # less than
|
||
quot => '"', # double quote
|
||
|
||
# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
|
||
AElig => '<27>', # capital AE diphthong (ligature)
|
||
Aacute => '<27>', # capital A, acute accent
|
||
Acirc => '<27>', # capital A, circumflex accent
|
||
Agrave => '<27>', # capital A, grave accent
|
||
Aring => '<27>', # capital A, ring
|
||
Atilde => '<27>', # capital A, tilde
|
||
Auml => '<27>', # capital A, dieresis or umlaut mark
|
||
Ccedil => '<27>', # capital C, cedilla
|
||
ETH => '<27>', # capital Eth, Icelandic
|
||
Eacute => '<27>', # capital E, acute accent
|
||
Ecirc => '<27>', # capital E, circumflex accent
|
||
Egrave => '<27>', # capital E, grave accent
|
||
Euml => '<27>', # capital E, dieresis or umlaut mark
|
||
Iacute => '<27>', # capital I, acute accent
|
||
Icirc => '<27>', # capital I, circumflex accent
|
||
Igrave => '<27>', # capital I, grave accent
|
||
Iuml => '<27>', # capital I, dieresis or umlaut mark
|
||
Ntilde => '<27>', # capital N, tilde
|
||
Oacute => '<27>', # capital O, acute accent
|
||
Ocirc => '<27>', # capital O, circumflex accent
|
||
Ograve => '<27>', # capital O, grave accent
|
||
Oslash => '<27>', # capital O, slash
|
||
Otilde => '<27>', # capital O, tilde
|
||
Ouml => '<27>', # capital O, dieresis or umlaut mark
|
||
THORN => '<27>', # capital THORN, Icelandic
|
||
Uacute => '<27>', # capital U, acute accent
|
||
Ucirc => '<27>', # capital U, circumflex accent
|
||
Ugrave => '<27>', # capital U, grave accent
|
||
Uuml => '<27>', # capital U, dieresis or umlaut mark
|
||
Yacute => '<27>', # capital Y, acute accent
|
||
aacute => '<27>', # small a, acute accent
|
||
acirc => '<27>', # small a, circumflex accent
|
||
aelig => '<27>', # small ae diphthong (ligature)
|
||
agrave => '<27>', # small a, grave accent
|
||
aring => '<27>', # small a, ring
|
||
atilde => '<27>', # small a, tilde
|
||
auml => '<27>', # small a, dieresis or umlaut mark
|
||
ccedil => '<27>', # small c, cedilla
|
||
eacute => '<27>', # small e, acute accent
|
||
ecirc => '<27>', # small e, circumflex accent
|
||
egrave => '<27>', # small e, grave accent
|
||
eth => '<27>', # small eth, Icelandic
|
||
euml => '<27>', # small e, dieresis or umlaut mark
|
||
iacute => '<27>', # small i, acute accent
|
||
icirc => '<27>', # small i, circumflex accent
|
||
igrave => '<27>', # small i, grave accent
|
||
iuml => '<27>', # small i, dieresis or umlaut mark
|
||
ntilde => '<27>', # small n, tilde
|
||
oacute => '<27>', # small o, acute accent
|
||
ocirc => '<27>', # small o, circumflex accent
|
||
ograve => '<27>', # small o, grave accent
|
||
oslash => '<27>', # small o, slash
|
||
otilde => '<27>', # small o, tilde
|
||
ouml => '<27>', # small o, dieresis or umlaut mark
|
||
szlig => '<27>', # small sharp s, German (sz ligature)
|
||
thorn => '<27>', # small thorn, Icelandic
|
||
uacute => '<27>', # small u, acute accent
|
||
ucirc => '<27>', # small u, circumflex accent
|
||
ugrave => '<27>', # small u, grave accent
|
||
uuml => '<27>', # small u, dieresis or umlaut mark
|
||
yacute => '<27>', # small y, acute accent
|
||
yuml => '<27>', # small y, dieresis or umlaut mark
|
||
|
||
# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
|
||
copy => '<27>', # copyright sign
|
||
reg => '<27>', # registered sign
|
||
nbsp => "\240", # non breaking space
|
||
|
||
# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
|
||
iexcl => '<27>',
|
||
cent => '<27>',
|
||
pound => '<27>',
|
||
curren => '<27>',
|
||
yen => '<27>',
|
||
brvbar => '<27>',
|
||
sect => '<27>',
|
||
uml => '<27>',
|
||
ordf => '<27>',
|
||
laquo => '<27>',
|
||
'not' => '<27>', # not is a keyword in perl
|
||
shy => '<27>',
|
||
macr => '<27>',
|
||
deg => '<27>',
|
||
plusmn => '<27>',
|
||
sup1 => '<27>',
|
||
sup2 => '<27>',
|
||
sup3 => '<27>',
|
||
acute => '<27>',
|
||
micro => '<27>',
|
||
para => '<27>',
|
||
middot => '<27>',
|
||
cedil => '<27>',
|
||
ordm => '<27>',
|
||
raquo => '<27>',
|
||
frac14 => '<27>',
|
||
frac12 => '<27>',
|
||
frac34 => '<27>',
|
||
iquest => '<27>',
|
||
'times' => '<27>', # times is a keyword in perl
|
||
divide => '<27>',
|
||
);
|
||
|
||
# Make the oposite mapping
|
||
while (my($entity, $char) = each(%entity2char)) {
|
||
$char2entity{$char} = "&$entity;";
|
||
}
|
||
|
||
# Fill inn missing entities
|
||
for (0 .. 255) {
|
||
next if exists $char2entity{chr($_)};
|
||
$char2entity{chr($_)} = "&#$_;";
|
||
}
|
||
|
||
my %subst; # compiled encoding regexps
|
||
|
||
|
||
sub decode_entities
|
||
{
|
||
my $array;
|
||
if (defined wantarray) {
|
||
$array = [@_]; # copy
|
||
} else {
|
||
$array = \@_; # modify in-place
|
||
}
|
||
my $c;
|
||
for (@$array) {
|
||
s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
|
||
s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
|
||
s/(&(\w+);?)/$entity2char{$2} || $1/eg;
|
||
}
|
||
wantarray ? @$array : $array->[0];
|
||
}
|
||
|
||
sub encode_entities
|
||
{
|
||
my $ref;
|
||
if (defined wantarray) {
|
||
my $x = $_[0];
|
||
$ref = \$x; # copy
|
||
} else {
|
||
$ref = \$_[0]; # modify in-place
|
||
}
|
||
if (defined $_[1]) {
|
||
unless (exists $subst{$_[1]}) {
|
||
# Because we can't compile regex we fake it with a cached sub
|
||
$subst{$_[1]} =
|
||
eval "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1}/g; }";
|
||
die $@ if $@;
|
||
}
|
||
&{$subst{$_[1]}}($$ref);
|
||
} else {
|
||
# Encode control chars, high bit chars and '<', '&', '>', '"'
|
||
$$ref =~ s/([^\n\t !\#\$%\'-;=?-~])/$char2entity{$1}/g;
|
||
}
|
||
$$ref;
|
||
}
|
||
|
||
# Set up aliases
|
||
*encode = \&encode_entities;
|
||
*decode = \&decode_entities;
|
||
|
||
1;
|