250 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
			
		
		
	
	
			250 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			Perl
		
	
	
	
	
	
package HTML::Entities;
 | 
						||
 | 
						||
# $Id: Entities.pm,v 1.13 1998/03/26 21:19:05 aas Exp $
 | 
						||
 | 
						||
=head1 NAME
 | 
						||
 | 
						||
HTML::Entities - Encode or decode strings with HTML entities
 | 
						||
 | 
						||
=head1 SYNOPSIS
 | 
						||
 | 
						||
 use HTML::Entities;
 | 
						||
 | 
						||
 $a = "Våre norske tegn bør æres";
 | 
						||
 decode_entities($a);
 | 
						||
 encode_entities($a, "\200-\377");
 | 
						||
 | 
						||
=head1 DESCRIPTION
 | 
						||
 | 
						||
This module deals with encoding and decoding of strings with HTML
 | 
						||
character entites.  The module provide the following functions:
 | 
						||
 | 
						||
=over 4
 | 
						||
 | 
						||
=item decode_entities($string)
 | 
						||
 | 
						||
This routine replaces HTML entities found in the $string with the
 | 
						||
corresponding ISO-8859/1 character.  Unrecognized entities are left
 | 
						||
alone.
 | 
						||
 | 
						||
=item endode_entities($string, [$unsafe_chars])
 | 
						||
 | 
						||
This routine replaces unsafe characters in $string with their entity
 | 
						||
representation.  A second argument can be given to specify which
 | 
						||
characters to concider as unsafe.  The default set of characters to
 | 
						||
expand are control chars, high-bit chars and the '<', '&', '>' and '"'
 | 
						||
character.
 | 
						||
 | 
						||
=back
 | 
						||
 | 
						||
Both routines modify the string passed in as the first argument if
 | 
						||
called in void context.  In scalar and array context the encoded or
 | 
						||
decoded string is returned (and the argument string is left
 | 
						||
unchanged).
 | 
						||
 | 
						||
If you prefer not to import these routines into your namespace you can
 | 
						||
call them as:
 | 
						||
 | 
						||
  use HTML::Entities ();
 | 
						||
  $encoded = HTML::Entities::encode($a);
 | 
						||
  $decoded = HTML::Entities::decode($a);
 | 
						||
 | 
						||
The module can also export the %char2entity and the %entity2char
 | 
						||
hashes which contains the mapping from all characters to the
 | 
						||
corresponding entities.
 | 
						||
 | 
						||
=head1 COPYRIGHT
 | 
						||
 | 
						||
Copyright 1995-1998 Gisle Aas. All rights reserved.
 | 
						||
 | 
						||
This library is free software; you can redistribute it and/or
 | 
						||
modify it under the same terms as Perl itself.
 | 
						||
 | 
						||
=cut
 | 
						||
 | 
						||
use strict;
 | 
						||
use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
 | 
						||
use vars qw(%entity2char %char2entity);
 | 
						||
 | 
						||
require 5.004;
 | 
						||
require Exporter;
 | 
						||
@ISA = qw(Exporter);
 | 
						||
 | 
						||
@EXPORT = qw(encode_entities decode_entities);
 | 
						||
@EXPORT_OK = qw(%entity2char %char2entity);
 | 
						||
 | 
						||
$VERSION = sprintf("%d.%02d", q$Revision: 1.13 $ =~ /(\d+)\.(\d+)/);
 | 
						||
sub Version { $VERSION; }
 | 
						||
 | 
						||
 | 
						||
%entity2char = (
 | 
						||
 # Some normal chars that have special meaning in SGML context
 | 
						||
 amp    => '&',  # ampersand 
 | 
						||
'gt'    => '>',  # greater than
 | 
						||
'lt'    => '<',  # less than
 | 
						||
 quot   => '"',  # double quote
 | 
						||
 | 
						||
 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
 | 
						||
 AElig	=> '<27>',  # capital AE diphthong (ligature)
 | 
						||
 Aacute	=> '<27>',  # capital A, acute accent
 | 
						||
 Acirc	=> '<27>',  # capital A, circumflex accent
 | 
						||
 Agrave	=> '<27>',  # capital A, grave accent
 | 
						||
 Aring	=> '<27>',  # capital A, ring
 | 
						||
 Atilde	=> '<27>',  # capital A, tilde
 | 
						||
 Auml	=> '<27>',  # capital A, dieresis or umlaut mark
 | 
						||
 Ccedil	=> '<27>',  # capital C, cedilla
 | 
						||
 ETH	=> '<27>',  # capital Eth, Icelandic
 | 
						||
 Eacute	=> '<27>',  # capital E, acute accent
 | 
						||
 Ecirc	=> '<27>',  # capital E, circumflex accent
 | 
						||
 Egrave	=> '<27>',  # capital E, grave accent
 | 
						||
 Euml	=> '<27>',  # capital E, dieresis or umlaut mark
 | 
						||
 Iacute	=> '<27>',  # capital I, acute accent
 | 
						||
 Icirc	=> '<27>',  # capital I, circumflex accent
 | 
						||
 Igrave	=> '<27>',  # capital I, grave accent
 | 
						||
 Iuml	=> '<27>',  # capital I, dieresis or umlaut mark
 | 
						||
 Ntilde	=> '<27>',  # capital N, tilde
 | 
						||
 Oacute	=> '<27>',  # capital O, acute accent
 | 
						||
 Ocirc	=> '<27>',  # capital O, circumflex accent
 | 
						||
 Ograve	=> '<27>',  # capital O, grave accent
 | 
						||
 Oslash	=> '<27>',  # capital O, slash
 | 
						||
 Otilde	=> '<27>',  # capital O, tilde
 | 
						||
 Ouml	=> '<27>',  # capital O, dieresis or umlaut mark
 | 
						||
 THORN	=> '<27>',  # capital THORN, Icelandic
 | 
						||
 Uacute	=> '<27>',  # capital U, acute accent
 | 
						||
 Ucirc	=> '<27>',  # capital U, circumflex accent
 | 
						||
 Ugrave	=> '<27>',  # capital U, grave accent
 | 
						||
 Uuml	=> '<27>',  # capital U, dieresis or umlaut mark
 | 
						||
 Yacute	=> '<27>',  # capital Y, acute accent
 | 
						||
 aacute	=> '<27>',  # small a, acute accent
 | 
						||
 acirc	=> '<27>',  # small a, circumflex accent
 | 
						||
 aelig	=> '<27>',  # small ae diphthong (ligature)
 | 
						||
 agrave	=> '<27>',  # small a, grave accent
 | 
						||
 aring	=> '<27>',  # small a, ring
 | 
						||
 atilde	=> '<27>',  # small a, tilde
 | 
						||
 auml	=> '<27>',  # small a, dieresis or umlaut mark
 | 
						||
 ccedil	=> '<27>',  # small c, cedilla
 | 
						||
 eacute	=> '<27>',  # small e, acute accent
 | 
						||
 ecirc	=> '<27>',  # small e, circumflex accent
 | 
						||
 egrave	=> '<27>',  # small e, grave accent
 | 
						||
 eth	=> '<27>',  # small eth, Icelandic
 | 
						||
 euml	=> '<27>',  # small e, dieresis or umlaut mark
 | 
						||
 iacute	=> '<27>',  # small i, acute accent
 | 
						||
 icirc	=> '<27>',  # small i, circumflex accent
 | 
						||
 igrave	=> '<27>',  # small i, grave accent
 | 
						||
 iuml	=> '<27>',  # small i, dieresis or umlaut mark
 | 
						||
 ntilde	=> '<27>',  # small n, tilde
 | 
						||
 oacute	=> '<27>',  # small o, acute accent
 | 
						||
 ocirc	=> '<27>',  # small o, circumflex accent
 | 
						||
 ograve	=> '<27>',  # small o, grave accent
 | 
						||
 oslash	=> '<27>',  # small o, slash
 | 
						||
 otilde	=> '<27>',  # small o, tilde
 | 
						||
 ouml	=> '<27>',  # small o, dieresis or umlaut mark
 | 
						||
 szlig	=> '<27>',  # small sharp s, German (sz ligature)
 | 
						||
 thorn	=> '<27>',  # small thorn, Icelandic
 | 
						||
 uacute	=> '<27>',  # small u, acute accent
 | 
						||
 ucirc	=> '<27>',  # small u, circumflex accent
 | 
						||
 ugrave	=> '<27>',  # small u, grave accent
 | 
						||
 uuml	=> '<27>',  # small u, dieresis or umlaut mark
 | 
						||
 yacute	=> '<27>',  # small y, acute accent
 | 
						||
 yuml	=> '<27>',  # small y, dieresis or umlaut mark
 | 
						||
 | 
						||
 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
 | 
						||
 copy   => '<27>',  # copyright sign
 | 
						||
 reg    => '<27>',  # registered sign
 | 
						||
 nbsp   => "\240", # non breaking space
 | 
						||
 | 
						||
 # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
 | 
						||
 iexcl  => '<27>',
 | 
						||
 cent   => '<27>',
 | 
						||
 pound  => '<27>',
 | 
						||
 curren => '<27>',
 | 
						||
 yen    => '<27>',
 | 
						||
 brvbar => '<27>',
 | 
						||
 sect   => '<27>',
 | 
						||
 uml    => '<27>',
 | 
						||
 ordf   => '<27>',
 | 
						||
 laquo  => '<27>',
 | 
						||
'not'   => '<27>',    # not is a keyword in perl
 | 
						||
 shy    => '<27>',
 | 
						||
 macr   => '<27>',
 | 
						||
 deg    => '<27>',
 | 
						||
 plusmn => '<27>',
 | 
						||
 sup1   => '<27>',
 | 
						||
 sup2   => '<27>',
 | 
						||
 sup3   => '<27>',
 | 
						||
 acute  => '<27>',
 | 
						||
 micro  => '<27>',
 | 
						||
 para   => '<27>',
 | 
						||
 middot => '<27>',
 | 
						||
 cedil  => '<27>',
 | 
						||
 ordm   => '<27>',
 | 
						||
 raquo  => '<27>',
 | 
						||
 frac14 => '<27>',
 | 
						||
 frac12 => '<27>',
 | 
						||
 frac34 => '<27>',
 | 
						||
 iquest => '<27>',
 | 
						||
'times' => '<27>',    # times is a keyword in perl
 | 
						||
 divide => '<27>',
 | 
						||
);
 | 
						||
 | 
						||
# Make the oposite mapping
 | 
						||
while (my($entity, $char) = each(%entity2char)) {
 | 
						||
    $char2entity{$char} = "&$entity;";
 | 
						||
}
 | 
						||
 | 
						||
# Fill inn missing entities
 | 
						||
for (0 .. 255) {
 | 
						||
    next if exists $char2entity{chr($_)};
 | 
						||
    $char2entity{chr($_)} = "&#$_;";
 | 
						||
}
 | 
						||
 | 
						||
my %subst;  # compiled encoding regexps
 | 
						||
 | 
						||
 | 
						||
sub decode_entities
 | 
						||
{
 | 
						||
    my $array;
 | 
						||
    if (defined wantarray) {
 | 
						||
	$array = [@_]; # copy
 | 
						||
    } else {
 | 
						||
	$array = \@_;  # modify in-place
 | 
						||
    }
 | 
						||
    my $c;
 | 
						||
    for (@$array) {
 | 
						||
	s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
 | 
						||
	s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
 | 
						||
	s/(&(\w+);?)/$entity2char{$2} || $1/eg;
 | 
						||
    }
 | 
						||
    wantarray ? @$array : $array->[0];
 | 
						||
}
 | 
						||
 | 
						||
sub encode_entities
 | 
						||
{
 | 
						||
    my $ref;
 | 
						||
    if (defined wantarray) {
 | 
						||
	my $x = $_[0];
 | 
						||
	$ref = \$x;     # copy
 | 
						||
    } else {
 | 
						||
	$ref = \$_[0];  # modify in-place
 | 
						||
    }
 | 
						||
    if (defined $_[1]) {
 | 
						||
	unless (exists $subst{$_[1]}) {
 | 
						||
	    # Because we can't compile regex we fake it with a cached sub
 | 
						||
	    $subst{$_[1]} =
 | 
						||
	      eval "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1}/g; }";
 | 
						||
	    die $@ if $@;
 | 
						||
	}
 | 
						||
	&{$subst{$_[1]}}($$ref);
 | 
						||
    } else {
 | 
						||
	# Encode control chars, high bit chars and '<', '&', '>', '"'
 | 
						||
	$$ref =~ s/([^\n\t !\#\$%\'-;=?-~])/$char2entity{$1}/g;
 | 
						||
    }
 | 
						||
    $$ref;
 | 
						||
}
 | 
						||
 | 
						||
# Set up aliases
 | 
						||
*encode = \&encode_entities;
 | 
						||
*decode = \&decode_entities;
 | 
						||
 | 
						||
1;
 |