blob: 6a5e92adfbac1b2d7bb5529f7c9cfd7d3a996162 [file] [log] [blame]
# Unicode.pm: handle conversion to unicode.
#
# Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Original author: Patrice Dumas <pertusus@free.fr>
package Texinfo::Convert::Unicode;
# Seems to be the perl version required for Encode:
# http://cpansearch.perl.org/src/DANKOGAI/Encode-2.47/Encode/README.e2x
# http://coding.derkeiler.com/Archive/Perl/comp.lang.perl.misc/2005-12/msg00833.html
use 5.007_003;
use strict;
use Encode;
use Unicode::Normalize;
use Carp qw(cluck);
use Unicode::EastAsianWidth;
require Exporter;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
@ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
# This allows declaration use Texinfo::Convert::Unicode ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
%EXPORT_TAGS = ( 'all' => [ qw(
unicode_accent
encoded_accents
unicode_for_brace_no_arg_command
unicode_text
string_width
) ] );
@EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
@EXPORT = qw(
);
our %unicode_diacritics = (
'H' => '030B',
'ringaccent' => '030A',
"'" => '0301',
'v' => '030C',
',' => '0327',
'^' => '0302',
'dotaccent' => '0307',
'`' => '0300',
'=' => '0304',
'~' => '0303',
'"' => '0308',
'udotaccent' => '0323',
'ubaraccent' => '0332',
'u' => '0306',
'tieaccent' => '0361',
'ogonek' => '0328'
);
our %diacritics_accent_commands;
foreach my $diacritic(keys(%unicode_diacritics)) {
$diacritics_accent_commands{$unicode_diacritics{$diacritic}} = $diacritic;
}
our %unicode_accented_letters = (
'dotaccent' => { # dot above
'A' => '0226', #C moz-1.2
'a' => '0227', #c moz-1.2
'B' => '1E02',
'b' => '1E03',
'C' => '010A',
'c' => '010B',
'D' => '1E0A',
'd' => '1E0B',
'E' => '0116',
'e' => '0117',
'F' => '1E1E',
'f' => '1E1F',
'G' => '0120',
'g' => '0121',
'H' => '1E22',
'h' => '1E23',
'i' => '0069',
'I' => '0130',
'N' => '1E44',
'n' => '1E45',
'O' => '022E', #Y moz-1.2
'o' => '022F', #v moz-1.2
'P' => '1E56',
'p' => '1E57',
'R' => '1E58',
'r' => '1E59',
'S' => '1E60',
's' => '1E61',
'T' => '1E6A',
't' => '1E6B',
'W' => '1E86',
'w' => '1E87',
'X' => '1E8A',
'x' => '1E8B',
'Y' => '1E8E',
'y' => '1E8F',
'Z' => '017B',
'z' => '017C',
},
'udotaccent' => { # dot below
'A' => '1EA0',
'a' => '1EA1',
'B' => '1E04',
'b' => '1E05',
'D' => '1E0C',
'd' => '1E0D',
'E' => '1EB8',
'e' => '1EB9',
'H' => '1E24',
'h' => '1E25',
'I' => '1ECA',
'i' => '1ECB',
'K' => '1E32',
'k' => '1E33',
'L' => '1E36',
'l' => '1E37',
'M' => '1E42',
'm' => '1E43',
'N' => '1E46',
'n' => '1E47',
'O' => '1ECC',
'o' => '1ECD',
'R' => '1E5A',
'r' => '1E5B',
'S' => '1E62',
's' => '1E63',
'T' => '1E6C',
't' => '1E6D',
'U' => '1EE4',
'u' => '1EE5',
'V' => '1E7E',
'v' => '1E7F',
'W' => '1E88',
'w' => '1E89',
'Y' => '1EF4',
'y' => '1EF5',
'Z' => '1E92',
'z' => '1E93',
},
'ubaraccent' => { # line below
'B' => '1E06',
'b' => '1E07',
'D' => '1E0E',
'd' => '1E0F',
'h' => '1E96',
'K' => '1E34',
'k' => '1E35',
'L' => '1E3A',
'l' => '1E3B',
'N' => '1E48',
'n' => '1E49',
'R' => '1E5E',
'r' => '1E5F',
'T' => '1E6E',
't' => '1E6F',
'Z' => '1E94',
'z' => '1E95',
},
',' => { # cedilla
'C' => '00C7',
'c' => '00E7',
'D' => '1E10',
'd' => '1E11',
'E' => '0228', #C moz-1.2
'e' => '0229', #c moz-1.2
'G' => '0122',
'g' => '0123',
'H' => '1E28',
'h' => '1E29',
'K' => '0136',
'k' => '0137',
'L' => '013B',
'l' => '013C',
'N' => '0145',
'n' => '0146',
'R' => '0156',
'r' => '0157',
'S' => '015E',
's' => '015F',
'T' => '0162',
't' => '0163',
},
'=' => { # macron
'A' => '0100',
'a' => '0101',
'E' => '0112',
'e' => '0113',
'I' => '012A',
'i' => '012B',
'G' => '1E20',
'g' => '1E21',
'O' => '014C',
'o' => '014D',
'U' => '016A',
'u' => '016B',
'Y' => '0232', #? moz-1.2
'y' => '0233', #? moz-1.2
},
'"' => { # diaeresis
'A' => '00C4',
'a' => '00E4',
'E' => '00CB',
'e' => '00EB',
'H' => '1E26',
'h' => '1E27',
'I' => '00CF',
'i' => '00EF',
'O' => '00D6',
'o' => '00F6',
't' => '1E97',
'U' => '00DC',
'u' => '00FC',
'W' => '1E84',
'w' => '1E85',
'X' => '1E8C',
'x' => '1E8D',
'y' => '00FF',
'Y' => '0178',
},
'u' => { # breve
'A' => '0102',
'a' => '0103',
'E' => '0114',
'e' => '0115',
'G' => '011E',
'g' => '011F',
'I' => '012C',
'i' => '012D',
'O' => '014E',
'o' => '014F',
'U' => '016C',
'u' => '016D',
},
"'" => { # acute
'A' => '00C1',
'a' => '00E1',
'C' => '0106',
'c' => '0107',
'E' => '00C9',
'e' => '00E9',
'G' => '01F4',
'g' => '01F5',
'I' => '00CD',
'i' => '00ED',
'K' => '1E30',
'k' => '1E31',
'L' => '0139',
'l' => '013A',
'M' => '1E3E',
'm' => '1E3F',
'N' => '0143',
'n' => '0144',
'O' => '00D3',
'o' => '00F3',
'P' => '1E54',
'p' => '1E55',
'R' => '0154',
'r' => '0155',
'S' => '015A',
's' => '015B',
'U' => '00DA',
'u' => '00FA',
'W' => '1E82',
'w' => '1E83',
'Y' => '00DD',
'y' => '00FD',
'Z' => '0179',
'z' => '018A',
},
'~' => { # tilde
'A' => '00C3',
'a' => '00E3',
'E' => '1EBC',
'e' => '1EBD',
'I' => '0128',
'i' => '0129',
'N' => '00D1',
'n' => '00F1',
'O' => '00D5',
'o' => '00F5',
'U' => '0168',
'u' => '0169',
'V' => '1E7C',
'v' => '1E7D',
'Y' => '1EF8',
'y' => '1EF9',
},
'`' => { # grave
'A' => '00C0',
'a' => '00E0',
'E' => '00C8',
'e' => '00E8',
'I' => '00CC',
'i' => '00EC',
'N' => '01F8',
'n' => '01F9',
'O' => '00D2',
'o' => '00F2',
'U' => '00D9',
'u' => '00F9',
'W' => '1E80',
'w' => '1E81',
'Y' => '1EF2',
'y' => '1EF3',
},
'^' => { # circumflex
'A' => '00C2',
'a' => '00E2',
'C' => '0108',
'c' => '0109',
'E' => '00CA',
'e' => '00EA',
'G' => '011C',
'g' => '011D',
'H' => '0124',
'h' => '0125',
'I' => '00CE',
'i' => '00EE',
'J' => '0134',
'j' => '0135',
'O' => '00D4',
'o' => '00F4',
'S' => '015C',
's' => '015D',
'U' => '00DB',
'u' => '00FB',
'W' => '0174',
'w' => '0175',
'Y' => '0176',
'y' => '0177',
'Z' => '1E90',
'z' => '1E91',
},
'ringaccent' => { # ring
'A' => '00C5',
'a' => '00E5',
'U' => '016E',
'u' => '016F',
'w' => '1E98',
'y' => '1E99',
},
'v' => { # caron
'A' => '01CD',
'a' => '01CE',
'C' => '010C',
'c' => '010D',
'D' => '010E',
'd' => '010F',
'E' => '011A',
'e' => '011B',
'G' => '01E6',
'g' => '01E7',
'H' => '021E', #K with moz-1.2
'h' => '021F', #k with moz-1.2
'I' => '01CF',
'i' => '01D0',
'K' => '01E8',
'k' => '01E9',
'L' => '013D', #L' with moz-1.2
'l' => '013E', #l' with moz-1.2
'N' => '0147',
'n' => '0148',
'O' => '01D1',
'o' => '01D2',
'R' => '0158',
'r' => '0159',
'S' => '0160',
's' => '0161',
'T' => '0164',
't' => '0165',
'U' => '01D3',
'u' => '01D4',
'Z' => '017D',
'z' => '017E',
},
'H' => { # double acute
'O' => '0150',
'o' => '0151',
'U' => '0170',
'u' => '0171',
},
'ogonek' => {
'A' => '0104',
'a' => '0105',
'E' => '0118',
'e' => '0119',
'I' => '012E',
'i' => '012F',
'U' => '0172',
'u' => '0173',
'O' => '01EA',
'o' => '01EB',
},
);
our %unicode_simple_character_map = (
' ' => '0020',
'!' => '0021',
'"' => '0022',
'#' => '0023',
'$' => '0024',
'%' => '0025',
'&' => '0026',
"'" => '0027',
'(' => '0028',
')' => '0029',
'*' => '002A',
'+' => '002B',
',' => '002C',
'-' => '002D',
'.' => '002E',
'/' => '002F',
':' => '003A',
';' => '003B',
'<' => '003C',
'=' => '003D',
'>' => '003E',
'?' => '003F',
'@' => '0040',
'[' => '005B',
'\\' => '005C',
']' => '005D',
'^' => '005E',
'_' => '005F',
'`' => '0060',
'{' => '007B',
'|' => '007C',
'}' => '007D',
'~' => '007E',
);
# Also discussed on the texinfo list.
# taken from
#Latin extended additionnal
#http://www.alanwood.net/unicode/latin_extended_additional.html
#C1 Controls and Latin-1 Supplement
#http://www.alanwood.net/unicode/latin_1_supplement.html
#Latin Extended-A
#http://www.alanwood.net/unicode/latin_extended_a.html
#Latin Extended-B
#http://www.alanwood.net/unicode/latin_extended_b.html
#dotless i: 0131
#http://www.alanwood.net/unicode/arrows.html 21**
#http://www.alanwood.net/unicode/general_punctuation.html 20**
#http://www.alanwood.net/unicode/mathematical_operators.html 22**
our %unicode_map = (
'bullet' => '2022',
'copyright' => '00A9',
'registeredsymbol' => '00AE',
'dots' => '2026',
'enddots' => '',
'equiv' => '2261',
'error' => '',
'expansion' => '21A6',
'arrow' => '2192',
'minus' => '2212', # in mathematical operators
# 'minus' => '002D', # in latin1
'point' => '2605',
'print' => '22A3',
'result' => '21D2',
'today' => '',
'aa' => '00E5',
'AA' => '00C5',
'ae' => '00E6',
'oe' => '0153',
'AE' => '00C6',
'OE' => '0152',
'o' => '00F8',
'O' => '00D8',
'ss' => '00DF',
'DH' => '00D0',
'dh' => '00F0',
'TH' => '00DE',
'th' => '00FE',
'l' => '0142',
'L' => '0141',
'exclamdown' => '00A1',
'questiondown' => '00BF',
'pounds' => '00A3',
'ordf' => '00AA',
'ordm' => '00BA',
'comma' => '002C',
'atchar' => '0040',
'lbracechar' => '007B',
'rbracechar' => '007D',
'backslashchar' => '005C',
'hashchar' => '0023',
'euro' => '20AC',
'geq' => '2265',
'leq' => '2264',
'tie' => '',
# 'tie' => '0020',
'textdegree' => '00B0',
'quotedblleft' => '201C',
'quotedblright' => '201D',
'quoteleft' => '2018',
'quoteright' => '2019',
'quotedblbase' => '201E',
'quotesinglbase' => '201A',
'guillemetleft' => '00AB',
'guillemetright' => '00BB',
'guillemotleft' => '00AB',
'guillemotright' => '00BB',
'guilsinglleft' => '2039',
'guilsinglright' => '203A',
# this should only happen if the @clickstyle argument isn't a
# command with braces and no argument.
'click' => '2192',
);
# set the %unicode_character_brace_no_arg_commands value to the character
# corresponding to the hex value in %unicode_map.
our %unicode_character_brace_no_arg_commands;
foreach my $command (keys(%unicode_map)) {
if ($unicode_map{$command} ne '') {
my $char_nr = hex($unicode_map{$command});
if ($char_nr > 126 and $char_nr < 255) {
# this is very strange, indeed. The reason lies certainly in the
# magic backward compatibility support in perl for 8bit encodings.
$unicode_character_brace_no_arg_commands{$command} =
Encode::decode("iso-8859-1", chr($char_nr));
} else {
$unicode_character_brace_no_arg_commands{$command} = chr($char_nr);
}
}
}
our %unicode_entities;
# set entities corresponding to unicode_map
foreach my $command (keys(%unicode_map)) {
$unicode_entities{$command}
= '&#'.hex($unicode_map{$command}).';'
if ($unicode_map{$command} ne '');
}
our %transliterate_map = (
'00C5' => 'AA',
'00E5' => 'aa',
'00D8' => 'O',
'00F8' => 'o',
'00E6' => 'ae',
'0153' => 'oe',
'00C6' => 'AE',
'0152' => 'OE',
'00DF' => 'ss',
'0141' => 'L',
'0142' => 'l',
'00D0' => 'D',
'00F0' => 'd',
'00DE' => 'TH',
'00FE' => 'th',
'0415' => 'E',
'0435' => 'e',
'0426' => 'C',
'042A' => 'W',
'044A' => 'w',
'042C' => 'X',
'044C' => 'x',
'042E' => 'yu',
'042F' => 'YA',
'044F' => 'ya',
'0433' => 'g',
'0446' => 'c',
'04D7' => 'IO',
'00DD' => 'Y', # unidecode gets this wrong ?
# following appears in tests, this is required to have
# the same output with and without unidecode
'4E2D' => 'Zhong',
'6587' => 'Wen',
'793A' => 'Shi',
'4F8B' => 'Li',
'7B2C' => 'Di',
'7AE0' => 'Zhang',
'53E6' => 'Ling',
'4E2A' => 'Ge',
# in http://www.cantonese.sheik.co.uk/dictionary/characters/7/
# unidecode certainly gets it wrong
'4E00' => 'Yi',
'FF08' => '(',
'FF09' => ')',
'FF0C' => ',',
'5B66' => 'Xue',
'7FD2' => 'Xi',
'30DE' => 'ma',
'30CB' => 'ni',
'30E5' => 'yu',
'30A2' => 'a',
'30EB' => 'ru',
);
our %no_transliterate_map;
foreach my $symbol(keys(%unicode_map)) {
if ($unicode_map{$symbol} ne ''
and !exists($transliterate_map{$symbol})) {
$no_transliterate_map{$unicode_map{$symbol}} = 1;
}
}
our %transliterate_accent_map;
foreach my $command (keys(%unicode_accented_letters)) {
foreach my $letter(keys (%{$unicode_accented_letters{$command}})) {
$transliterate_accent_map{$unicode_accented_letters{$command}->{$letter}} = $letter
unless (exists($transliterate_map{$unicode_accented_letters{$command}->{$letter}}));
}
}
my %unicode_to_eight_bit = (
'iso8859_1' => {
'00A0' => 'A0',
'00A1' => 'A1',
'00A2' => 'A2',
'00A3' => 'A3',
'00A4' => 'A4',
'00A5' => 'A5',
'00A6' => 'A6',
'00A7' => 'A7',
'00A8' => 'A8',
'00A9' => 'A9',
'00AA' => 'AA',
'00AB' => 'AB',
'00AC' => 'AC',
'00AD' => 'AD',
'00AE' => 'AE',
'00AF' => 'AF',
'00B0' => 'B0',
'00B1' => 'B1',
'00B2' => 'B2',
'00B3' => 'B3',
'00B4' => 'B4',
'00B5' => 'B5',
'00B6' => 'B6',
'00B7' => 'B7',
'00B8' => 'B8',
'00B9' => 'B9',
'00BA' => 'BA',
'00BB' => 'BB',
'00BC' => 'BC',
'00BD' => 'BD',
'00BE' => 'BE',
'00BF' => 'BF',
'00C0' => 'C0',
'00C1' => 'C1',
'00C2' => 'C2',
'00C3' => 'C3',
'00C4' => 'C4',
'00C5' => 'C5',
'00C6' => 'C6',
'00C7' => 'C7',
'00C7' => 'C7',
'00C8' => 'C8',
'00C9' => 'C9',
'00CA' => 'CA',
'00CB' => 'CB',
'00CC' => 'CC',
'00CD' => 'CD',
'00CE' => 'CE',
'00CF' => 'CF',
'00D0' => 'D0',
'00D1' => 'D1',
'00D2' => 'D2',
'00D3' => 'D3',
'00D4' => 'D4',
'00D5' => 'D5',
'00D6' => 'D6',
'00D7' => 'D7',
'00D8' => 'D8',
'00D9' => 'D9',
'00DA' => 'DA',
'00DB' => 'DB',
'00DC' => 'DC',
'00DD' => 'DD',
'00DE' => 'DE',
'00DF' => 'DF',
'00E0' => 'E0',
'00E1' => 'E1',
'00E2' => 'E2',
'00E3' => 'E3',
'00E4' => 'E4',
'00E5' => 'E5',
'00E6' => 'E6',
'00E7' => 'E7',
'00E8' => 'E8',
'00E9' => 'E9',
'00EA' => 'EA',
'00EB' => 'EB',
'00EC' => 'EC',
'00ED' => 'ED',
'00EE' => 'EE',
'00EF' => 'EF',
'00F0' => 'F0',
'00F1' => 'F1',
'00F2' => 'F2',
'00F3' => 'F3',
'00F4' => 'F4',
'00F5' => 'F5',
'00F6' => 'F6',
'00F7' => 'F7',
'00F8' => 'F8',
'00F9' => 'F9',
'00FA' => 'FA',
'00FB' => 'FB',
'00FC' => 'FC',
'00FD' => 'FD',
'00FE' => 'FE',
'00FF' => 'FF',
},
'iso8859_15' => {
'00A0' => 'A0',
'00A1' => 'A1',
'00A2' => 'A2',
'00A3' => 'A3',
'20AC' => 'A4',
'00A5' => 'A5',
'0160' => 'A6',
'00A7' => 'A7',
'0161' => 'A8',
'00A9' => 'A9',
'00AA' => 'AA',
'00AB' => 'AB',
'00AC' => 'AC',
'00AD' => 'AD',
'00AE' => 'AE',
'00AF' => 'AF',
'00B0' => 'B0',
'00B1' => 'B1',
'00B2' => 'B2',
'00B3' => 'B3',
'017D' => 'B4',
'00B5' => 'B5',
'00B6' => 'B6',
'00B7' => 'B7',
'017E' => 'B8',
'00B9' => 'B9',
'00BA' => 'BA',
'00BB' => 'BB',
'0152' => 'BC',
'0153' => 'BD',
'0178' => 'BE',
'00BF' => 'BF',
'00C0' => 'C0',
'00C1' => 'C1',
'00C2' => 'C2',
'00C3' => 'C3',
'00C4' => 'C4',
'00C5' => 'C5',
'00C6' => 'C6',
'00C7' => 'C7',
'00C8' => 'C8',
'00C9' => 'C9',
'00CA' => 'CA',
'00CB' => 'CB',
'00CC' => 'CC',
'00CD' => 'CD',
'00CE' => 'CE',
'00CF' => 'CF',
'00D0' => 'D0',
'00D1' => 'D1',
'00D2' => 'D2',
'00D3' => 'D3',
'00D4' => 'D4',
'00D5' => 'D5',
'00D6' => 'D6',
'00D7' => 'D7',
'00D8' => 'D8',
'00D9' => 'D9',
'00DA' => 'DA',
'00DB' => 'DB',
'00DC' => 'DC',
'00DD' => 'DD',
'00DE' => 'DE',
'00DF' => 'DF',
'00E0' => 'E0',
'00E1' => 'E1',
'00E2' => 'E2',
'00E3' => 'E3',
'00E4' => 'E4',
'00E5' => 'E5',
'00E6' => 'E6',
'00E7' => 'E7',
'00E8' => 'E8',
'00E9' => 'E9',
'00EA' => 'EA',
'00EB' => 'EB',
'00EC' => 'EC',
'00ED' => 'ED',
'00EE' => 'EE',
'00EF' => 'EF',
'00F0' => 'F0',
'00F1' => 'F1',
'00F2' => 'F2',
'00F3' => 'F3',
'00F4' => 'F4',
'00F5' => 'F5',
'00F6' => 'F6',
'00F7' => 'F7',
'00F8' => 'F8',
'00F9' => 'F9',
'00FA' => 'FA',
'00FB' => 'FB',
'00FC' => 'FC',
'00FD' => 'FD',
'00FE' => 'FE',
'00FF' => 'FF',
},
'iso8859_2' => {
'00A0' => 'A0',
'0104' => 'A1',
'02D8' => 'A2',
'0141' => 'A3',
'00A4' => 'A4',
'013D' => 'A5',
'015A' => 'A6',
'00A7' => 'A7',
'00A8' => 'A8',
'015E' => 'AA',
'0164' => 'AB',
'0179' => 'AC',
'00AD' => 'AD',
'017D' => 'AE',
'017B' => 'AF',
'00B0' => 'B0',
'0105' => 'B1',
'02DB' => 'B2',
'0142' => 'B3',
'00B4' => 'B4',
'013E' => 'B5',
'015B' => 'B6',
'02C7' => 'B7',
'00B8' => 'B8',
'0161' => 'B9',
'015F' => 'BA',
'0165' => 'BB',
'017A' => 'BC',
'02DD' => 'BD',
'017E' => 'BE',
'017C' => 'BF',
'0154' => 'C0',
'00C1' => 'C1',
'00C2' => 'C2',
'0102' => 'C3',
'00C4' => 'C4',
'0139' => 'C5',
'0106' => 'C6',
'00C7' => 'C7',
'010C' => 'C8',
'00C9' => 'C9',
'0118' => 'CA',
'00CB' => 'CB',
'011A' => 'CC',
'00CD' => 'CD',
'00CE' => 'CE',
'010E' => 'CF',
'0110' => 'D0',
'0143' => 'D1',
'0147' => 'D2',
'00D3' => 'D3',
'00D4' => 'D4',
'0150' => 'D5',
'00D6' => 'D6',
'00D7' => 'D7',
'0158' => 'D8',
'016E' => 'D9',
'00DA' => 'DA',
'0170' => 'DB',
'00DC' => 'DC',
'00DD' => 'DD',
'0162' => 'DE',
'00DF' => 'DF',
'0155' => 'E0',
'00E1' => 'E1',
'00E2' => 'E2',
'0103' => 'E3',
'00E4' => 'E4',
'013A' => 'E5',
'0107' => 'E6',
'00E7' => 'E7',
'010D' => 'E8',
'00E9' => 'E9',
'0119' => 'EA',
'00EB' => 'EB',
'011B' => 'EC',
'00ED' => 'ED',
'00EE' => 'EE',
'010F' => 'EF',
'0111' => 'F0',
'0144' => 'F1',
'0148' => 'F2',
'00F3' => 'F3',
'00F4' => 'F4',
'0151' => 'F5',
'00F6' => 'F6',
'00F7' => 'F7',
'0159' => 'F8',
'016F' => 'F9',
'00FA' => 'FA',
'0171' => 'FB',
'00FC' => 'FC',
'00FD' => 'FD',
'0163' => 'FE',
'02D9' => 'FF',
},
'koi8' => {
'0415' => 'A3',
'0454' => 'A4',
'0456' => 'A6',
'0457' => 'A7',
'04D7' => 'B3',
'0404' => 'B4',
'0406' => 'B6',
'0407' => 'B7',
'042E' => 'C0',
'0430' => 'C1',
'0431' => 'C2',
'0446' => 'C3',
'0434' => 'C4',
'0435' => 'C5',
'0444' => 'C6',
'0433' => 'C7',
'0445' => 'C8',
'0438' => 'C9',
'0439' => 'CA',
'043A' => 'CB',
'043B' => 'CC',
'043C' => 'CD',
'043D' => 'CE',
'043E' => 'CF',
'043F' => 'D0',
'044F' => 'D1',
'0440' => 'D2',
'0441' => 'D3',
'0442' => 'D4',
'0443' => 'D5',
'0436' => 'D6',
'0432' => 'D7',
'044C' => 'D8',
'044B' => 'D9',
'0437' => 'DA',
'0448' => 'DB',
'044D' => 'DC',
'0449' => 'DD',
'0447' => 'DE',
'044A' => 'DF',
'042D' => 'E0',
'0410' => 'E1',
'0411' => 'E2',
'0426' => 'E3',
'0414' => 'E4',
'0415' => 'E5',
'0424' => 'E6',
'0413' => 'E7',
'0425' => 'E8',
'0418' => 'E9',
'0419' => 'EA',
'041A' => 'EB',
'041B' => 'EC',
'041C' => 'ED',
'041D' => 'EE',
'041E' => 'EF',
'041F' => 'F0',
'042F' => 'F1',
'0420' => 'F2',
'0421' => 'F3',
'0422' => 'F4',
'0423' => 'F5',
'0416' => 'F6',
'0412' => 'F7',
'042C' => 'F8',
'042B' => 'F9',
'0417' => 'FA',
'0428' => 'FB',
'042D' => 'FC',
'0429' => 'FD',
'0427' => 'FE',
'042A' => 'FF',
},
);
# currently unused
my %makeinfo_transliterate_map = (
'0416' => 'ZH',
'0447' => 'ch',
'00EB' => 'e',
'0414' => 'D',
'0159' => 'r',
'00E6' => 'ae',
'042B' => 'Y',
'00FA' => 'u',
'043B' => 'l',
'00DE' => 'TH',
'00D9' => 'U',
'00C4' => 'A',
'0148' => 'n',
'00F6' => 'o',
'0434' => 'd',
'041E' => 'O',
'041B' => 'L',
'044B' => 'y',
'0107' => 'c',
'0415' => 'E',
'00C1' => 'A',
'00D3' => 'O',
'00DB' => 'U',
'016E' => 'U',
'013A' => 'l',
'017B' => 'Z',
'00F1' => 'n',
'0428' => 'SH',
'0153' => 'oe',
'00F4' => 'o',
'0144' => 'n',
'0404' => 'IE',
'0427' => 'CH',
'0162' => 'T',
'017A' => 'z',
'0448' => 'sh',
'0436' => 'zh',
'00F9' => 'u',
'0406' => 'I',
'0103' => 'a',
'0422' => 'T',
'0160' => 'S',
'0165' => 't',
'017E' => 'z',
'00F0' => 'd',
'043E' => 'o',
'043D' => 'n',
'013E' => 'l',
'0412' => 'V',
'0111' => 'd',
'0155' => 's',
'017C' => 'z',
'00CE' => 'I',
'042D' => 'E',
'00C8' => 'E',
'00F8' => 'oe',
'00F2' => 'o',
'00FF' => 'y',
'0420' => 'R',
'0119' => 'e',
'00D2' => 'O',
'043C' => 'm',
'00D0' => 'DH',
'0179' => 'Z',
'0110' => 'D',
'043F' => 'p',
'0170' => 'U',
'011A' => 'E',
'010C' => 'C',
'015A' => 'S',
'0433' => 'g',
'00E1' => 'a',
'010D' => 'c',
'00CC' => 'I',
'016F' => 'u',
'0457' => 'yi',
'00C2' => 'A',
'0438' => 'i',
'00E3' => 'a',
'0435' => 'e',
'0440' => 'r',
'042A' => 'W',
'0431' => 'b',
'00EE' => 'i',
'0150' => 'O',
'00E8' => 'e',
'0418' => 'I',
'00CF' => 'I',
'015F' => 's',
'0142' => 'l',
'0147' => 'N',
'00DF' => 'ss',
'00E5' => 'aa',
'00C3' => 'A',
'0106' => 'C',
'0141' => 'L',
'0164' => 'T',
'017D' => 'Z',
'00EC' => 'i',
'041C' => 'M',
'00C9' => 'E',
'00E0' => 'a',
'043A' => 'k',
'00F5' => 'o',
'042C' => 'X',
'0449' => 'shch',
'0444' => 'f',
'0139' => 'L',
'0158' => 'R',
'00F3' => 'o',
'00FB' => 'u',
'0424' => 'F',
'0446' => 'c',
'0423' => 'U',
'0442' => 't',
'00FD' => 'y',
'0102' => 'A',
'0104' => 'A',
'00CB' => 'E',
'0426' => 'C',
'00CD' => 'I',
'0437' => 'z',
'0178' => 'y',
'00D4' => 'O',
'044D' => 'e',
'0432' => 'v',
'013D' => 'L',
'0163' => 't',
'0456' => 'i',
'011B' => 'e',
'044F' => 'ya',
'0429' => 'SHCH',
'0411' => 'B',
'044A' => 'w',
'00C6' => 'AE',
'041D' => 'N',
'00DA' => 'U',
'00C0' => 'A',
'0152' => 'OE',
'00DD' => 'Y',
'0154' => 'R',
'00E9' => 'e',
'00D5' => 'O',
'041F' => 'P',
'0161' => 's',
'0430' => 'a',
'0445' => 'h',
'00E2' => 'a',
'00D6' => 'O',
'0407' => 'YI',
'00CA' => 'E',
'0439' => 'i',
'0171' => 'u',
'00DC' => 'U',
'042F' => 'YA',
'0425' => 'H',
'00FE' => 'th',
'00D1' => 'N',
'044C' => 'x',
'010F' => 'd',
'0410' => 'A',
'0443' => 'u',
'00EF' => 'i',
'0105' => 'a',
'00EA' => 'e',
'00E4' => 'a',
'015E' => 'S',
'0417' => 'Z',
'00ED' => 'i',
'00FC' => 'u',
'04D7' => 'IO',
'00D8' => 'OE',
'0419' => 'I',
'0421' => 'S',
'0143' => 'N',
'010E' => 'D',
'0413' => 'G',
'015B' => 's',
'0151' => 'o',
'00E7' => 'c',
'00C5' => 'AA',
'0441' => 's',
'0118' => 'E',
'00C7' => 'C',
'041A' => 'K',
'0454' => 'ie',
'042E' => 'yu',
);
sub unicode_accent($$)
{
my $text = shift;
my $command = shift;
my $accent = $command->{'cmdname'};
my $result;
# special handling of @dotless{i}.
# \x{0131}\x{0308} for @dotless{i} @" doesn't lead to NFC 00ef.
# so it is set to a real dotless i only if not in an accent command.
if ($accent eq 'dotless') {
if ($text eq 'i' and (!$command->{'parent'}
or !$command->{'parent'}->{'parent'}
or !$command->{'parent'}->{'parent'}->{'cmdname'}
or !$unicode_accented_letters{$command->{'parent'}->{'parent'}->{'cmdname'}})) {
$result = "\x{0131}";
} else {
$result = $text;
}
return $result;
}
if (defined($unicode_diacritics{$accent})) {
$result = Unicode::Normalize::NFC($text . chr(hex($unicode_diacritics{$accent})));
return $result;
} else {
return undef;
}
}
sub unicode_text($$)
{
my $text = shift;
my $in_code = shift;
if (!$in_code) {
$text =~ s/---/\x{2014}/g;
$text =~ s/--/\x{2013}/g;
$text =~ s/``/\x{201C}/g;
$text =~ s/''/\x{201D}/g;
$text =~ s/'/\x{2019}/g;
$text =~ s/`/\x{2018}/g;
}
return $text;
}
# return the 8 bit, if it exists, and the unicode codepoint
sub _eight_bit_and_unicode_point($$)
{
my $char = shift;
my $encoding = shift;
my $encoding_map_name
= $Texinfo::Encoding::eight_bit_encoding_aliases{$encoding};
my ($eight_bit, $codepoint);
if (ord($char) <= 128) {
# 7bit ascii characters, the same in every 8bit encodings
$eight_bit = uc(sprintf("%02x",ord($char)));
$codepoint = uc(sprintf("%04x",ord($char)));
} elsif (ord($char) <= hex(0xFFFF)) {
$codepoint = uc(sprintf("%04x",ord($char)));
if (exists($unicode_to_eight_bit{$encoding_map_name}->{$codepoint})) {
$eight_bit
= $unicode_to_eight_bit{$encoding_map_name}->{$codepoint};
}
}
return ($eight_bit, $codepoint);
}
# format a stack of accents as unicode
sub unicode_accents($$$$;$)
{
my $converter = shift;
my $result = shift;
my $stack = shift;
my $format_accent = shift;
my $set_case = shift;
while (@$stack) {
my $formatted_result = unicode_accent($result, $stack->[-1]);
last if (!defined($formatted_result));
$result = $formatted_result;
pop @$stack;
}
if ($set_case) {
if ($set_case > 0) {
$result = uc ($result);
} else {
$result = lc ($result);
}
}
while (@$stack) {
my $accent_command = pop @$stack;
$result = &$format_accent($converter, $result, $accent_command, $set_case);
}
return $result;
}
sub eight_bit_accents($$$$$;$)
{
my $converter = shift;
my $unicode_formatted = shift;
my $stack = shift;
my $encoding = shift;
my $convert_accent = shift;
my $set_case = shift;
my $result = $unicode_formatted;
my $debug;
#$debug = 1;
if ($debug) {
print STDERR "STACK: ".join('|', map {$_->{'cmdname'}} @$stack)."\n";
}
# accents are formatted and the intermediate results are kept, such
# that we can return the maximum of multiaccented letters that can be
# rendered with a given eight bit formatting. undef is stored when
# there is no corresponding unicode anymore.
my @results_stack = ([$unicode_formatted, undef]);
while (@$stack) {
if (defined($unicode_formatted)) {
$unicode_formatted
= unicode_accent($unicode_formatted, $stack->[-1]);
if (defined($unicode_formatted) and $set_case) {
if ($set_case > 0) {
$unicode_formatted = uc($unicode_formatted);
} else {
$unicode_formatted = lc($unicode_formatted);
}
}
}
push @results_stack, [$unicode_formatted, $stack->[-1]];
pop @$stack;
}
if ($debug) {
print STDERR "PARTIAL_RESULTS_STACK:\n";
foreach my $partial_result (@results_stack) {
my $command = 'TEXT';
$command = $partial_result->[1]->{'cmdname'} if ($partial_result->[1]);
if (defined($partial_result->[0])) {
print STDERR " -> ".Encode::encode('utf8', $partial_result->[0])
."|$command\n";
} else {
print STDERR " -> NO UTF8 |$command\n";
}
}
}
# At this point we have the utf8 encoded results for the accent
# commands stack, with all the intermediate results.
# For each one we'll check if it is possible to encode it in the
# current eight bit output encoding table and, if so set the result
# to the character.
my $eight_bit = '';
while (@results_stack) {
my $char = $results_stack[0]->[0];
last if (!defined($char));
my ($new_eight_bit, $new_codepoint)
= _eight_bit_and_unicode_point($char, $encoding);
if ($debug) {
my $command = 'TEXT';
$command = $results_stack[0]->[1]->{'cmdname'}
if ($results_stack[0]->[1]);
my $new_eight_bit_txt = 'UNDEF';
$new_eight_bit_txt = $new_eight_bit if (defined($new_eight_bit));
print STDERR "" . Encode::encode('utf8', $char)
. " ($command) new_codepoint: $new_codepoint 8bit: $new_eight_bit_txt old: $eight_bit\n";
}
# no corresponding eight bit character found for a composed character
last if (!$new_eight_bit);
# in that case, the new eight bit character is the same than the one
# found with one less character (and it isn't a @dotless{i}). It may
# hapen in 2 case, both meaning that there is no corresponding 8bit char:
#
# -> there are 2 characters in accent. This could happen, for example
# if an accent that cannot be rendered is found and it leads to
# appending or prepending a character. For example this happens for
# @={@,{@~{n}}}, where @,{@~{n}} is expanded to a 2 character:
# n with a tilde, followed by a ,
# In that case, the additional utf8 diacritic is appended, which
# means that it is composed with the , and leaves n with a tilde
# untouched.
# -> the diacritic is appended but the normal form doesn't lead
# to a composed character, such that the first character
# of the string is unchanged. This, for example, happens for
# @ubaraccent{a} since there is no composed accent with a and an
# underbar.
last if ($new_eight_bit eq $eight_bit
and !($results_stack[0]->[1]->{'cmdname'} eq 'dotless'
and $char eq 'i'));
$result = $results_stack[0]->[0];
$eight_bit = $new_eight_bit;
shift @results_stack;
}
# handle the remaining accents, that have not been converted to 8bit
# compatible unicode
shift @results_stack if (!defined($results_stack[0]->[1]));
while (@results_stack) {
$result = &$convert_accent($converter, $result,
$results_stack[0]->[1],
$set_case);
shift @results_stack;
}
# An important remark is that the final conversion to 8bit is left to
# perl.
return $result;
}
sub encoded_accents($$$$$;$)
{
my $converter = shift;
my $text = shift;
my $stack = shift;
my $encoding = shift;
my $format_accent = shift;
my $set_case = shift;
if ($encoding) {
if ($encoding eq 'utf-8') {
return unicode_accents($converter, $text, $stack, $format_accent,
$set_case);
} elsif ($Texinfo::Encoding::eight_bit_encoding_aliases{$encoding}) {
return eight_bit_accents($converter, $text, $stack, $encoding,
$format_accent, $set_case);
}
}
return undef;
}
# returns the unicode for a command with brace and no arg
# if it is known that it is present for the encoding
sub unicode_for_brace_no_arg_command($$) {
my $command = shift;
my $encoding = shift;
if ($unicode_character_brace_no_arg_commands{$command}
and $encoding
and ($encoding eq 'utf-8'
or ($Texinfo::Encoding::eight_bit_encoding_aliases{$encoding}
and $unicode_to_eight_bit{$Texinfo::Encoding::eight_bit_encoding_aliases{$encoding}}->{$unicode_map{$command}}))) {
return $unicode_character_brace_no_arg_commands{$command};
} else {
return undef;
}
}
# string length size taking into account that east asian characters
# may take 2 spaces.
sub string_width($)
{
my $string = shift;
$string =~ s/\p{InFullwidth}/\x{02}/g;
$string =~ s/\pM/\x{00}/g;
$string =~ s/\p{IsPrint}/\x{01}/g;
$string =~ s/\p{IsSpace}/\x{01}/g;
$string =~ s/[^\x{01}\x{02}]/\x{00}/g;
# This sums up the byte values of the bytes in $string, which now are
# all either 0, 1 or 2. This is faster. The original, more readable
# version is below.
return unpack("U0%32A*", $string);
if (! defined($string)) {
Carp::cluck();
}
my $width = 0;
foreach my $character(split '', $string) {
if ($character =~ /\p{InFullwidth}/) {
$width += 2;
} elsif ($character =~ /\pM/) {
# a mark set at length 0
} elsif ($character =~ /\p{IsPrint}/ or $character =~ /\p{IsSpace}/) {
# newlines are not {IsPrint} in v5.14.2
$width += 1;
} elsif ($character =~ /\p{IsControl}/) {
# Control chars may be added, for instance, as part of @image formatting
#} elsif ($character eq '') { # could that happen?
} else {
#print STDERR "unknown char`$character'\n";
}
}
return $width;
}
1;
__END__
=head1 NAME
Texinfo::Convert::Unicode - Handle conversion to Unicode
=head1 SYNOPSIS
use Texinfo::Convert::Unicode qw(unicode_accent encoded_accents
unicode_text);
my ($innermost_contents, $stack)
= Texinfo::Common::find_innermost_accent_contents($accent);
my $formatted_accents = encoded_accents ($converter,
convert($innermost_contents), $stack, $encoding,
\&Texinfo::Text::ascii_accent_fallback);
my $accent_text = unicode_accent('e', $accent_command);
=head1 DESCRIPTION
Texinfo::Convert::Unicode provides methods that deals with unicode for
converters. Unicode is important, because it is used internally in perl
for strings, therefore if converted to Unicode, a string could be output
in other encodings as well when writting out the converted documents.
When an encoding is given as argument of a method of the module, the
accented letters should only be converted to unicode if it is known that
it will be possible to convert the unicode points to encoded charactes
in the encoding character set.
=head1 METHODS
=over
=item $result = unicode_accent($text, $accent_command)
I<$text> is the text appearing within an accent command. I<$accent_command>
should be a Texinfo tree element corresponding to an accent command taking
an argument. The function returns the unicode representation of the accented
character.
=item $result = encoded_accents ($converter, $text, $stack, $encoding, $format_accent, $set_case)
I<$converter> is a converter object. It may be undef if there is no need of
converter object in I<$format_accent> (I<$format_accent> described below).
I<$text> is the text appearing within nested accent commands. I<$stack> is
an array reference holding the nested accents texinfo element trees. For
example, I<$text> could be the formatted content and I<$stack> the stack
returned by C<Texinfo::Common::find_innermost_accent_contents>. I<$encoding>
is the encoding the accented characters should be encoded to. If
I<$encoding> not set the I<$result> is set to undef. I<$format_accent>
is a function reference that is used to format the accent commands if
there is no encoded character available for the encoding I<$encoding>
at some point of the conversion of the I<$stack>. Last, if
I<$set_case> is positive, the result is upper-cased, while if it is negative,
the result is lower-cased.
=item $result = unicode_text ($text, $in_code)
Return I<$text> with characters encoded in unicode. If I<$in_code>
is set, the text is considered to be in code style.
=item $result = unicode_for_brace_no_arg_command($command_name, $encoding)
Return the unicode representing a command with brace and no argument
I<$command_name> (like C<@bullet{}>, C<@aa{}> or C<@guilsinglleft{}>),
or undef if there is no available encoded character for encoding
I<$encoding>.
=item $width = string_width($string)
Return the string width, taking into account the fact that some characters
have a zero width (like composing accents) while some have a width of 2
(most chinese characters, for example).
=back
=head1 AUTHOR
Patrice Dumas, E<lt>pertusus@free.frE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
This library is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License,
or (at your option) any later version.
=cut