usr/share/texinfo/lib/libintl-perl/lib/Locale/Recode.pm - toolchains/quantenna - Git at Google

 #! /bin/false

 # vim: set autoindent shiftwidth=4 tabstop=4:
 # $Id: Recode.pm,v 1.1 2011-10-12 23:51:26 pertusus Exp $

 # Portable character conversion for Perl.
 # Copyright (C) 2002-2009 Guido Flohr <guido@imperia.net>,
 # all rights reserved.

 # This program is free software; you can redistribute it and/or modify it
 # under the terms of the GNU Library General Public License as published
 # by the Free Software Foundation; either version 2, or (at your option)
 # any later version.

 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Library General Public License for more details.

 # You should have received a copy of the GNU Library General Public
 # License along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 # USA.

 package Locale::Recode;

 use strict;

 require Locale::Recode::_Conversions;

 my $loaded = {};
 my $has_encode;

 sub new
 {
     my $class = ref($_[0]) || $_[0];
     shift;
     my %args = @_;

     my $self = bless {}, $class;

     my ($from_codeset, $to_codeset) = @args{qw (from to)};

     unless ($from_codeset && $to_codeset) {
 		require Carp;
         Carp::croak (<<EOF);
 	Usage: $class->new (from => FROM_CODESET, to => TO_CODESET);
 EOF
     }

     # Find a conversion path.
 	my $path = Locale::Recode::_Conversions->findPath ($from_codeset,
 													   $to_codeset);
 	unless ($path) {
 		$self->{__error} = 'EINVAL';
 		return $self;
 	}

 	my @conversions = ();
 	foreach (@$path) {
 		my ($module, $from, $to) = @$_;

 		unless ($loaded->{$module}) {
 			eval "require Locale::RecodeData::$module";
 			if ($@) {
 				$self->{__error} = $@;
 				return $self;
 			}

 			$loaded->{$module} = 1;
 		}

 		my $module_name = "Locale::RecodeData::$module";
 		my $method = 'new';
 		my $object = $module_name->$method (from => $from,
 											to => $to);

 		push @conversions, $object;
 	}

 	$self->{__conversions} = \@conversions;

     return $self;
 }

 sub resolveAlias
 {
 	my ($class, $alias) = @_;

 	return Locale::Recode::_Conversions->resolveAlias ($alias);
 }

 sub getSupported
 {
 	return [ Locale::Recode::_Conversions->listSupported ];
 }

 sub getCharsets
 {
 	my $self = shift;
 	my %all = map { $_ => 1 } @{&getSupported};

 	require Locale::Recode::_Aliases;

 	my $conversions = Locale::Recode::_Conversions->listSupported;
 	foreach my $charset (keys %{Locale::Recode::_Aliases::ALIASES()}) {
 		my $mime_name = $self->resolveAlias ($charset);
 		next unless exists $all{$mime_name};
 		$all{$charset} = 1;
 	}

 	return [ keys %all ];
 }

 sub recode
 {
     my $self = $_[0];

     return if $self->{__error};

     return 1 unless defined $_[1];

     my $chain = $self->{__conversions};

     foreach my $module (@$chain) {
 		my $success = $module->_recode ($_[1]);

 		unless ($success) {
 			$self->{__error} = $module->_getError;
 			return;
 		}
     }

     return 1;
 }

 sub getError
 {
     my $self = shift;
     my $error = $self->{__error} or return;

     if ('EINVAL' eq $error) {
 		return 'Invalid conversion';
     } else {
 		return $error;
     }
 }

 1;

 __END__

 =head1 NAME

 Locale::Recode - Object-Oriented Portable Charset Conversion

 =head1 SYNOPSIS

   use Locale::Recode;

   $cd = Locale::Recode->new (from => 'UTF-8',
                              to   => 'ISO-8859-1');

   die $cd->getError if $cd->getError;

   $cd->recode ($text) or die $cd->getError;

   $mime_name = Locale::Recode->resolveAlias ('latin-1');

   $supported = Locale::Recode->getSupported;

   $complete = Locale::Recode->getCharsets;

 =head1 DESCRIPTION

 This module provides routines that convert textual data from one
 codeset to another in a portable way.  The module has been started
 before Encode(3) was written.  It's main purpose today is to provide
 charset conversion even when Encode(3) is not available on the system.
 It should also work for older Perl versions without Unicode support.

 Internally Locale::Recode(3) will use Encode(3) whenever possible,
 to allow for a faster conversion and for a wider range of supported
 charsets, and will only fall back to the Perl implementation when
 Encode(3) is not available or does not support a particular charset
 that Locale::Recode(3) does.

 Locale::Recode(3) is part of libintl-perl, and it's main purpose is
 actually to implement a portable charset conversion framework for
 the message translation facilities described in Locale::TextDomain(3).

 =head1 CONSTRUCTOR

 The constructor C<new()> requires two named arguments:

 =over 4

 =item B<from>

 The encoding of the original data.  Case doesn't matter, aliases
 are resolved.

 =item B<to>

 The target encoding.  Again, case doesn't matter, and aliases
 are resolved.

 =back

 The constructor will never fail.  In case of an error, the object's
 internal state is set to bad and it will refuse to do any conversions.
 You can inquire the reason for the failure with the method
 getError().

 =head1 OBJECT METHODS

 The following object methods are available.

 =over 4

 =item B<recode (STRING)>

 Converts B<STRING> from the source encoding into the destination
 encoding.  In case of success, a truth value is returned, false
 otherwise.  You can inquire the reason for the failure with the
 method getError().

 =item B<getError>

 Returns either false if the object is not in an error state or
 an error message.

 =back

 =head1 CLASS METHODS

 The object provides some additional class methods:

 =over 4

 =item B<getSupported>

 Returns a reference to a list of all supported charsets.  This
 may implicitely load additional Encode(3) conversions like
 Encode::HanExtra(3) which may produce considerable load on your
 system.

 The method is therefore not intended for regular use but rather
 for getting resp. displaying I<once> a list of available encodings.

 The members of the list are all converted to uppercase!

 =item B<getCharsets>

 Like getSupported() but also returns all available aliases.

 =back

 =head1 SUPPORTED CHARSETS

 The range of supported charsets is system-dependent.  The following
 somewhat special charsets are always available:

 =over 4

 =item B<UTF-8>

 UTF-8 is available independently of your Perl version.  For Perl 5.6
 or better or in the presence of Encode(3), conversions are not done
 in Perl but with the interfaces provided by these facilities which
 are written in C, hence much faster.

 Encoding data I<into> UTF-8 is fast, even if it is done in Perl.
 Decoding it in Perl may become quite slow.  If you frequently have
 to decode UTF-8 with B<Locale::Recode> you will probably want to
 make sure that you do that with Perl 5.6 or beter, or install Encode(3) to
 speed up things.

 =item B<INTERNAL>

 UTF-8 is fast to write but hard to read for applications.  It is
 therefore not the worst for internal string representation but not
 far from that.  Locale::Recode(3) stores strings internally as a
 reference to an array of integer values like most programming languages
 (Perl is an exception) do, trading memory for performance.

 The integer values are the UCS-4 codes of the characters in host
 byte order.

 The encoding B<INTERNAL> is directly availabe via Locale::Recode(3)
 but of course you should not really use it for data exchange, unless
 you know what you are doing.

 =back

 Locale::Recode(3) has native support for a plethora of other encodings,
 most of them 8 bit encodings that are fast to decode, including most
 encodings used on popular micros like the ISO-8859-* series of encodings,
 most Windows-* encodings (also known as CP*), Macintosh, Atari, etc.

 =head1 NAMES AND ALIASES

 Each charset resp. encoding is available internally under a unique
 name.  Whenever the information was available, the preferred MIME name
 (see L<http://www.iana.org/assignments/character-sets/>) was chosen as
 the internal name.

 Alias handling is quite strict.  The module does not make wild guesses
 at what you mean ("What's the meaning of the acronym JIS" is a valid
 alias for "7bit-jis" in Encode(3) ....) but aims at providing common
 aliases only.  The same applies to so-called aliases that are really
 mistakes, like "utf8" for UTF-8.

 The module knows all aliases that are listed with the IANA character
 set registry (L<http://www.iana.org/assignments/character-sets/>), plus
 those known to libiconv version 1.8, and a bunch of additional ones.

 =head1 CONVERSION TABLES

 The conversion tables have either been taken from official sources
 like the IANA or the Unicode Consortium, from Bruno Haible's libiconv,
 or from the sources of the GNU libc and the regression tests for
 libintl-perl will check for conformance here.  For some encodings this data
 differs from Encode(3)'s data which would cause these tests to fail.
 In these cases, the module will not invoke the Encode(3) methods, but
 will fall back to the internal implementation for the sake of
 consistency.

 The few encodings that are affected are so simple that you will not
 experience any real performance penalty unless you convert large chunks
 of data.  But the package is not really intended for such use anyway, and
 since Encode(3) is relatively new, I rather think that the differences
 are bugs in Encode which will be fixed soon.

 =head1 BUGS

 The module should provide fall back conversions for other Unicode
 encoding schemes like UCS-2, UCS-4 (big- and little-endian).

 The pure Perl UTF-8 decoder will not always handle corrupt UTF-8
 correctly, especially at the end and at the beginning of the string.
 This is not likely to be fixed, since the module's intention is not
 to be a consistency checker for UTF-8 data.

 =head1 AUTHOR

 Copyright (C) 2002-2009, Guido Flohr E<lt>guido@imperia.netE<gt>, all
 rights reserved.  See the source code for details.

 This software is contributed to the Perl community by Imperia
 (L<http://www.imperia.net/>).

 =head1 SEE ALSO

 Encode(3), iconv(3), iconv(1), recode(1), perl(1)

 =cut
 Local Variables:
 mode: perl
 perl-indent-level: 4
 perl-continued-statement-offset: 4
 perl-continued-brace-offset: 0
 perl-brace-offset: -4
 perl-brace-imaginary-offset: 0
 perl-label-offset: -4
 cperl-indent-level: 4
 cperl-continued-statement-offset: 2
 tab-width: 4
 End:
 =cut
	#! /bin/false

	# vim: set autoindent shiftwidth=4 tabstop=4:
	# $Id: Recode.pm,v 1.1 2011-10-12 23:51:26 pertusus Exp $

	# Portable character conversion for Perl.
	# Copyright (C) 2002-2009 Guido Flohr <guido@imperia.net>,
	# all rights reserved.

	# This program is free software; you can redistribute it and/or modify it
	# under the terms of the GNU Library General Public License as published
	# by the Free Software Foundation; either version 2, or (at your option)
	# any later version.

	# This program is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Library General Public License for more details.

	# You should have received a copy of the GNU Library General Public
	# License along with this program; if not, write to the Free Software
	# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
	# USA.

	package Locale::Recode;

	use strict;

	require Locale::Recode::_Conversions;

	my $loaded = {};
	my $has_encode;

	sub new
	{
	my $class = ref($_[0]) \|\| $_[0];
	shift;
	my %args = @_;

	my $self = bless {}, $class;

	my ($from_codeset, $to_codeset) = @args{qw (from to)};

	unless ($from_codeset && $to_codeset) {
	require Carp;
	Carp::croak (<<EOF);
	Usage: $class->new (from => FROM_CODESET, to => TO_CODESET);
	EOF
	}

	# Find a conversion path.
	my $path = Locale::Recode::_Conversions->findPath ($from_codeset,
	$to_codeset);
	unless ($path) {
	$self->{__error} = 'EINVAL';
	return $self;
	}

	my @conversions = ();
	foreach (@$path) {
	my ($module, $from, $to) = @$_;

	unless ($loaded->{$module}) {
	eval "require Locale::RecodeData::$module";
	if ($@) {
	$self->{__error} = $@;
	return $self;
	}

	$loaded->{$module} = 1;
	}

	my $module_name = "Locale::RecodeData::$module";
	my $method = 'new';
	my $object = $module_name->$method (from => $from,
	to => $to);

	push @conversions, $object;
	}

	$self->{__conversions} = \@conversions;

	return $self;
	}

	sub resolveAlias
	{
	my ($class, $alias) = @_;

	return Locale::Recode::_Conversions->resolveAlias ($alias);
	}

	sub getSupported
	{
	return [ Locale::Recode::_Conversions->listSupported ];
	}

	sub getCharsets
	{
	my $self = shift;
	my %all = map { $_ => 1 } @{&getSupported};

	require Locale::Recode::_Aliases;

	my $conversions = Locale::Recode::_Conversions->listSupported;
	foreach my $charset (keys %{Locale::Recode::_Aliases::ALIASES()}) {
	my $mime_name = $self->resolveAlias ($charset);
	next unless exists $all{$mime_name};
	$all{$charset} = 1;
	}

	return [ keys %all ];
	}

	sub recode
	{
	my $self = $_[0];

	return if $self->{__error};

	return 1 unless defined $_[1];

	my $chain = $self->{__conversions};

	foreach my $module (@$chain) {
	my $success = $module->_recode ($_[1]);

	unless ($success) {
	$self->{__error} = $module->_getError;
	return;
	}
	}

	return 1;
	}

	sub getError
	{
	my $self = shift;
	my $error = $self->{__error} or return;

	if ('EINVAL' eq $error) {
	return 'Invalid conversion';
	} else {
	return $error;
	}
	}

	1;

	__END__

	=head1 NAME

	Locale::Recode - Object-Oriented Portable Charset Conversion

	=head1 SYNOPSIS

	use Locale::Recode;

	$cd = Locale::Recode->new (from => 'UTF-8',
	to => 'ISO-8859-1');

	die $cd->getError if $cd->getError;

	$cd->recode ($text) or die $cd->getError;

	$mime_name = Locale::Recode->resolveAlias ('latin-1');

	$supported = Locale::Recode->getSupported;

	$complete = Locale::Recode->getCharsets;

	=head1 DESCRIPTION

	This module provides routines that convert textual data from one
	codeset to another in a portable way. The module has been started
	before Encode(3) was written. It's main purpose today is to provide
	charset conversion even when Encode(3) is not available on the system.
	It should also work for older Perl versions without Unicode support.

	Internally Locale::Recode(3) will use Encode(3) whenever possible,
	to allow for a faster conversion and for a wider range of supported
	charsets, and will only fall back to the Perl implementation when
	Encode(3) is not available or does not support a particular charset
	that Locale::Recode(3) does.

	Locale::Recode(3) is part of libintl-perl, and it's main purpose is
	actually to implement a portable charset conversion framework for
	the message translation facilities described in Locale::TextDomain(3).

	=head1 CONSTRUCTOR

	The constructor C<new()> requires two named arguments:

	=over 4

	=item B<from>

	The encoding of the original data. Case doesn't matter, aliases
	are resolved.

	=item B<to>

	The target encoding. Again, case doesn't matter, and aliases
	are resolved.

	=back

	The constructor will never fail. In case of an error, the object's
	internal state is set to bad and it will refuse to do any conversions.
	You can inquire the reason for the failure with the method
	getError().

	=head1 OBJECT METHODS

	The following object methods are available.

	=over 4

	=item B<recode (STRING)>

	Converts B<STRING> from the source encoding into the destination
	encoding. In case of success, a truth value is returned, false
	otherwise. You can inquire the reason for the failure with the
	method getError().

	=item B<getError>

	Returns either false if the object is not in an error state or
	an error message.

	=back

	=head1 CLASS METHODS

	The object provides some additional class methods:

	=over 4

	=item B<getSupported>

	Returns a reference to a list of all supported charsets. This
	may implicitely load additional Encode(3) conversions like
	Encode::HanExtra(3) which may produce considerable load on your
	system.

	The method is therefore not intended for regular use but rather
	for getting resp. displaying I<once> a list of available encodings.

	The members of the list are all converted to uppercase!

	=item B<getCharsets>

	Like getSupported() but also returns all available aliases.

	=back

	=head1 SUPPORTED CHARSETS

	The range of supported charsets is system-dependent. The following
	somewhat special charsets are always available:

	=over 4

	=item B<UTF-8>

	UTF-8 is available independently of your Perl version. For Perl 5.6
	or better or in the presence of Encode(3), conversions are not done
	in Perl but with the interfaces provided by these facilities which
	are written in C, hence much faster.

	Encoding data I<into> UTF-8 is fast, even if it is done in Perl.
	Decoding it in Perl may become quite slow. If you frequently have
	to decode UTF-8 with B<Locale::Recode> you will probably want to
	make sure that you do that with Perl 5.6 or beter, or install Encode(3) to
	speed up things.

	=item B<INTERNAL>

	UTF-8 is fast to write but hard to read for applications. It is
	therefore not the worst for internal string representation but not
	far from that. Locale::Recode(3) stores strings internally as a
	reference to an array of integer values like most programming languages
	(Perl is an exception) do, trading memory for performance.

	The integer values are the UCS-4 codes of the characters in host
	byte order.

	The encoding B<INTERNAL> is directly availabe via Locale::Recode(3)
	but of course you should not really use it for data exchange, unless
	you know what you are doing.

	=back

	Locale::Recode(3) has native support for a plethora of other encodings,
	most of them 8 bit encodings that are fast to decode, including most
	encodings used on popular micros like the ISO-8859-* series of encodings,
	most Windows-* encodings (also known as CP*), Macintosh, Atari, etc.

	=head1 NAMES AND ALIASES

	Each charset resp. encoding is available internally under a unique
	name. Whenever the information was available, the preferred MIME name
	(see L<http://www.iana.org/assignments/character-sets/>) was chosen as
	the internal name.

	Alias handling is quite strict. The module does not make wild guesses
	at what you mean ("What's the meaning of the acronym JIS" is a valid
	alias for "7bit-jis" in Encode(3) ....) but aims at providing common
	aliases only. The same applies to so-called aliases that are really
	mistakes, like "utf8" for UTF-8.

	The module knows all aliases that are listed with the IANA character
	set registry (L<http://www.iana.org/assignments/character-sets/>), plus
	those known to libiconv version 1.8, and a bunch of additional ones.

	=head1 CONVERSION TABLES

	The conversion tables have either been taken from official sources
	like the IANA or the Unicode Consortium, from Bruno Haible's libiconv,
	or from the sources of the GNU libc and the regression tests for
	libintl-perl will check for conformance here. For some encodings this data
	differs from Encode(3)'s data which would cause these tests to fail.
	In these cases, the module will not invoke the Encode(3) methods, but
	will fall back to the internal implementation for the sake of
	consistency.

	The few encodings that are affected are so simple that you will not
	experience any real performance penalty unless you convert large chunks
	of data. But the package is not really intended for such use anyway, and
	since Encode(3) is relatively new, I rather think that the differences
	are bugs in Encode which will be fixed soon.

	=head1 BUGS

	The module should provide fall back conversions for other Unicode
	encoding schemes like UCS-2, UCS-4 (big- and little-endian).

	The pure Perl UTF-8 decoder will not always handle corrupt UTF-8
	correctly, especially at the end and at the beginning of the string.
	This is not likely to be fixed, since the module's intention is not
	to be a consistency checker for UTF-8 data.

	=head1 AUTHOR

	Copyright (C) 2002-2009, Guido Flohr E<lt>guido@imperia.netE<gt>, all
	rights reserved. See the source code for details.

	This software is contributed to the Perl community by Imperia
	(L<http://www.imperia.net/>).

	=head1 SEE ALSO

	Encode(3), iconv(3), iconv(1), recode(1), perl(1)

	=cut
	Local Variables:
	mode: perl
	perl-indent-level: 4
	perl-continued-statement-offset: 4
	perl-continued-brace-offset: 0
	perl-brace-offset: -4
	perl-brace-imaginary-offset: 0
	perl-label-offset: -4
	cperl-indent-level: 4
	cperl-continued-statement-offset: 2
	tab-width: 4
	End:
	=cut