blob: e230a82107a662de038d1d4e4e78f447adeac6ff [file] [log] [blame]
# NodeNameNormalization.pm: output tree as normalized node name.
#
# Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Original author: Patrice Dumas <pertusus@free.fr>
# the rules for conversion are decribed in the Texinfo manual, for
# HTML crossrefs.
package Texinfo::Convert::NodeNameNormalization;
use 5.00405;
use strict;
use Unicode::Normalize;
use Text::Unidecode;
# for the accents definition
use Texinfo::Common;
# reuse some conversion hashes
use Texinfo::Convert::Text;
# use the hashes and functions
use Texinfo::Convert::Unicode;
require Exporter;
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
@ISA = qw(Exporter);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
# This allows declaration use Texinfo::Convert::NodeNameNormalization ':all';
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK
# will save memory.
%EXPORT_TAGS = ( 'all' => [ qw(
normalize_node
transliterate_texinfo
) ] );
@EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
@EXPORT = qw(
);
my %normalize_node_brace_no_arg_commands
= %Texinfo::Convert::Text::text_brace_no_arg_commands;
foreach my $command (keys(%Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands)) {
$normalize_node_brace_no_arg_commands{$command} =
$Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands{$command};
}
my %normalize_node_no_brace_commands
= %Texinfo::Common::no_brace_commands;
$normalize_node_no_brace_commands{'*'} = ' ';
my %accent_commands = %Texinfo::Common::accent_commands;
my %ignored_brace_commands;
foreach my $ignored_brace_command (#'xref','ref', 'pxref', 'inforef',
'anchor', 'footnote', 'shortcaption', 'caption', 'hyphenation') {
$ignored_brace_commands{$ignored_brace_command} = 1;
}
my %ignored_types;
foreach my $type ('empty_line_after_command', 'preamble',
'empty_spaces_after_command', 'spaces_at_end',
'empty_spaces_before_argument', 'empty_spaces_before_paragraph',
'space_at_end_menu_node',
'empty_spaces_after_close_brace',
'empty_space_at_end_def_bracketed') {
$ignored_types{$type} = 1;
}
sub normalize_node($)
{
my $root = shift;
my $result = convert($root);
$result = Unicode::Normalize::NFC($result);
$result = _unicode_to_protected($result);
$result = 'Top' if ($result =~ /^Top$/i);
return $result;
}
sub transliterate_texinfo($;$)
{
my $root = shift;
my $no_unidecode = shift;
my $result = convert($root);
$result = Unicode::Normalize::NFC($result);
$result = _unicode_to_protected(
_unicode_to_transliterate($result, $no_unidecode));
return $result;
}
sub convert($)
{
my $root = shift;
my $result = _convert($root);
}
sub _unicode_to_protected($)
{
my $text = shift;
my $result = '';
while ($text ne '') {
if ($text =~ s/^([A-Za-z0-9]+)//o) {
$result .= $1;
} elsif ($text =~ s/^ //o) {
$result .= '-';
} elsif ($text =~ s/^(.)//o) {
my $char = $1;
if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) {
$result .= '_' . lc($Texinfo::Convert::Unicode::unicode_simple_character_map{$char});
} else {
if (ord($char) <= hex(0xFFFF)) {
$result .= '_' . lc(sprintf("%04x",ord($char)));
} else {
$result .= '__' . lc(sprintf("%06x",ord($char)));
}
}
} else {
warn "Bug: unknown character _unicode_to_protected (likely in infinite loop)\n";
print STDERR "Text: !!$text!!\n";
sleep 1;
}
}
return $result;
}
sub _unicode_to_transliterate($;$)
{
my $text = shift;
my $no_unidecode = shift;
if (chomp($text)) {
warn "Bug: end of line to transliterate: $text\n";
}
my $result = '';
while ($text ne '') {
if ($text =~ s/^([A-Za-z0-9 ]+)//o) {
$result .= $1;
} elsif ($text =~ s/^(.)//o) {
my $char = $1;
if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) {
$result .= $char;
} elsif (ord($char) <= hex(0xFFFF)
and exists($Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))})) {
$result .= $Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))};
} elsif (ord($char) <= hex(0xFFFF)
and exists($Texinfo::Convert::Unicode::diacritics_accent_commands{uc(sprintf("%04x",ord($char)))})) {
$result .= '';
# in this case, we want to avoid calling unidecode, as we are sure
# that there is no useful transliteration of the unicode character
# instead we want to keep it as is.
# This is the case, for example, for @exclamdown, is corresponds
# with x00a1, but unidecode transliterates it to a !, we want
# to avoid that and keep x00a1.
} elsif (ord($char) <= hex(0xFFFF)
and exists($Texinfo::Convert::Unicode::no_transliterate_map{uc(sprintf("%04x",ord($char)))})) {
$result .= $char;
} else {
if ($no_unidecode) {
if (ord($char) <= hex(0xFFFF)
and exists ($Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))})) {
$result .= $Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))};
} else {
$result .= $char;
}
} else {
$result .= unidecode($char);
}
}
#print STDERR " ($no_unidecode) $text -> CHAR: ".ord($char)." ".uc(sprintf("%04x",ord($char)))."\n$result\n";
} else {
warn "Bug: unknown character _unicode_to_transliterate (likely in infinite loop)\n";
print STDERR "Text: !!$text!!\n";
sleep 1;
}
}
return $result;
}
sub _convert($;$);
sub _convert($;$)
{
my $root = shift;
my $in_sc = shift;
if (0) {
print STDERR "root\n";
print STDERR " Command: $root->{'cmdname'}\n" if ($root->{'cmdname'});
print STDERR " Type: $root->{'type'}\n" if ($root->{'type'});
print STDERR " Text: $root->{'text'}\n" if (defined($root->{'text'}));
#print STDERR " Special def_command: $root->{'extra'}->{'def_command'}\n"
# if (defined($root->{'extra'}) and $root->{'extra'}->{'def_command'});
}
return '' if (($root->{'type'} and $ignored_types{$root->{'type'}})
or ($root->{'cmdname'}
and ($ignored_brace_commands{$root->{'cmdname'}}
# here ignore the misc commands
or ($root->{'args'} and $root->{'args'}->[0]
and $root->{'args'}->[0]->{'type'}
and ($root->{'args'}->[0]->{'type'} eq 'misc_line_arg'
or $root->{'args'}->[0]->{'type'} eq 'misc_arg')))));
my $result = '';
if (defined($root->{'text'})) {
$result = $root->{'text'};
$result =~ s/\s+/ /go;
$result = uc($result) if ($in_sc);
}
if ($root->{'cmdname'}) {
my $command = $root->{'cmdname'};
if (defined($normalize_node_no_brace_commands{$root->{'cmdname'}})) {
return $normalize_node_no_brace_commands{$root->{'cmdname'}};
} elsif (defined($normalize_node_brace_no_arg_commands{$root->{'cmdname'}})) {
$command = $root->{'extra'}->{'clickstyle'}
if ($root->{'extra'}
and defined($root->{'extra'}->{'clickstyle'})
and defined($normalize_node_brace_no_arg_commands{$root->{'extra'}->{'clickstyle'}}));
my $result = $normalize_node_brace_no_arg_commands{$command};
if ($in_sc and $Texinfo::Common::letter_no_arg_commands{$command}) {
$result = uc($result);
}
return $result;
# commands with braces
} elsif ($accent_commands{$root->{'cmdname'}}) {
return '' if (!$root->{'args'});
my $accent_text = _convert($root->{'args'}->[0]);
my $accented_char
= Texinfo::Convert::Unicode::unicode_accent($accent_text,
$root);
if (!defined($accented_char)) {
# In this case, the node normalization do not follow the specification,
# but we cannot do better
$accented_char = Texinfo::Convert::Text::ascii_accent($accent_text,
$root);
}
if ($in_sc) {
return uc ($accented_char);
} else {
return $accented_char;
}
#} elsif ($root->{'cmdname'} eq 'image') {
# return _convert($root->{'args'}->[0]);
} elsif ($Texinfo::Common::ref_commands{$root->{'cmdname'}}) {
my @args_try_order;
if ($root->{'cmdname'} eq 'inforef') {
@args_try_order = (0, 1, 2);
} else {
@args_try_order = (0, 1, 2, 4, 3);
}
foreach my $index (@args_try_order) {
if (defined($root->{'args'}->[$index])) {
my $text = _convert($root->{'args'}->[$index]);
return $text if (defined($text) and $text =~ /\S/);
}
}
return '';
#} elsif ($root->{'cmdname'} eq 'email') {
# my $mail = _convert($root->{'args'}->[0]);
# return $mail if (defined($mail) and $mail ne '');
# my $text;
# $text = _convert($root->{'args'}->[1])
# if (defined($root->{'args'}->[1]));
# return $text if (defined($text) and ($text ne ''));
# #return $mail;
# return '';
# Here all the commands with args are processed, if they have
# more than one arg the first one is used.
} elsif ($root->{'args'} and $root->{'args'}->[0]
and (($root->{'args'}->[0]->{'type'}
and $root->{'args'}->[0]->{'type'} eq 'brace_command_arg')
or $root->{'cmdname'} eq 'math')) {
my $sc = 1 if ($root->{'cmdname'} eq 'sc' or $in_sc);
return _convert($root->{'args'}->[0], $sc);
}
}
if ($root->{'contents'}) {
foreach my $content (@{$root->{'contents'}}) {
$result .= _convert($content, $in_sc);
}
}
$result = '{'.$result.'}'
if ($root->{'type'} and $root->{'type'} eq 'bracketed');
return $result;
}
1;
__END__
=head1 NAME
Texinfo::Convert::NodeNameNormalization - Normalize and transliterate Texinfo trees
=head1 SYNOPSIS
use Texinfo::Convert::NodeNameNormalization qw(normalize_node
transliterate_texinfo);
my $normalized = normalize_node({'contents' => $node_contents});
my $file_name = transliterate_texinfo({'contents'
=> $section_contents});
=head1 DESCRIPTION
Texinfo::Convert::NodeNameNormalization allows to normalize node names,
with C<normalize_node> following the specification described in the
Texinfo manual for HTML Xref. This is usefull each time one want a
unique identifier for Texinfo content that is only composed of letter,
digits, C<-> and C<_>. In C<Texinfo::Parser> C<normalize_node> is used
for node, floats and anchor names normalization, but also float
types C<@acronym> and C<@abbr> first argument.
It is also possible to transliterate non ascii letters, instead of mangling
them, with C<transliterate_texinfo>, losing the uniqueness feature of
normalized node names.
=head1 METHODS
=over
=item $normalized = normalize_node($tree)
The Texinfo I<$tree> is returned as a string, normalized as described in the
Texinfo manual for HTML Xref.
The result will be poor for Texinfo trees which are not @-command arguments
(on an @-command line or in braces), for instance if the tree contains
C<@node> or block commands.
=item $transliterated = transliterate_texinfo($tree, $no_unidecode)
The Texinfo I<$tree> is returned as a string, with non ascii letters
transliterated as ascii, but otherwise similar with C<normalize_node>
output. If the optional I<$no_unidecode> argument is set, C<Text::Unidecode>
is not used for characters whose transliteration is not built-in.
=back
=head1 AUTHOR
Patrice Dumas, E<lt>pertusus@free.frE<gt>
=head1 COPYRIGHT AND LICENSE
Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
This library is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.
=cut