| # NodeNameNormalization.pm: output tree as normalized node name. |
| # |
| # Copyright 2010, 2011, 2012 Free Software Foundation, Inc. |
| # |
| # This program is free software; you can redistribute it and/or modify |
| # it under the terms of the GNU General Public License as published by |
| # the Free Software Foundation; either version 3 of the License, |
| # or (at your option) any later version. |
| # |
| # This program is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| # GNU General Public License for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with this program. If not, see <http://www.gnu.org/licenses/>. |
| # |
| # Original author: Patrice Dumas <pertusus@free.fr> |
| |
| # the rules for conversion are decribed in the Texinfo manual, for |
| # HTML crossrefs. |
| |
| package Texinfo::Convert::NodeNameNormalization; |
| |
| use 5.00405; |
| use strict; |
| |
| use Unicode::Normalize; |
| use Text::Unidecode; |
| # for the accents definition |
| use Texinfo::Common; |
| # reuse some conversion hashes |
| use Texinfo::Convert::Text; |
| # use the hashes and functions |
| use Texinfo::Convert::Unicode; |
| |
| require Exporter; |
| use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); |
| @ISA = qw(Exporter); |
| |
| # Items to export into callers namespace by default. Note: do not export |
| # names by default without a very good reason. Use EXPORT_OK instead. |
| # Do not simply export all your public functions/methods/constants. |
| |
| # This allows declaration use Texinfo::Convert::NodeNameNormalization ':all'; |
| # If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
| # will save memory. |
| %EXPORT_TAGS = ( 'all' => [ qw( |
| normalize_node |
| transliterate_texinfo |
| ) ] ); |
| |
| @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
| |
| @EXPORT = qw( |
| ); |
| |
| my %normalize_node_brace_no_arg_commands |
| = %Texinfo::Convert::Text::text_brace_no_arg_commands; |
| foreach my $command (keys(%Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands)) { |
| $normalize_node_brace_no_arg_commands{$command} = |
| $Texinfo::Convert::Unicode::unicode_character_brace_no_arg_commands{$command}; |
| } |
| |
| my %normalize_node_no_brace_commands |
| = %Texinfo::Common::no_brace_commands; |
| $normalize_node_no_brace_commands{'*'} = ' '; |
| |
| my %accent_commands = %Texinfo::Common::accent_commands; |
| |
| my %ignored_brace_commands; |
| foreach my $ignored_brace_command (#'xref','ref', 'pxref', 'inforef', |
| 'anchor', 'footnote', 'shortcaption', 'caption', 'hyphenation') { |
| $ignored_brace_commands{$ignored_brace_command} = 1; |
| } |
| |
| my %ignored_types; |
| foreach my $type ('empty_line_after_command', 'preamble', |
| 'empty_spaces_after_command', 'spaces_at_end', |
| 'empty_spaces_before_argument', 'empty_spaces_before_paragraph', |
| 'space_at_end_menu_node', |
| 'empty_spaces_after_close_brace', |
| 'empty_space_at_end_def_bracketed') { |
| $ignored_types{$type} = 1; |
| } |
| |
| |
| sub normalize_node($) |
| { |
| my $root = shift; |
| my $result = convert($root); |
| $result = Unicode::Normalize::NFC($result); |
| $result = _unicode_to_protected($result); |
| $result = 'Top' if ($result =~ /^Top$/i); |
| return $result; |
| } |
| |
| sub transliterate_texinfo($;$) |
| { |
| my $root = shift; |
| my $no_unidecode = shift; |
| my $result = convert($root); |
| $result = Unicode::Normalize::NFC($result); |
| $result = _unicode_to_protected( |
| _unicode_to_transliterate($result, $no_unidecode)); |
| return $result; |
| } |
| |
| sub convert($) |
| { |
| my $root = shift; |
| my $result = _convert($root); |
| } |
| |
| sub _unicode_to_protected($) |
| { |
| my $text = shift; |
| my $result = ''; |
| while ($text ne '') { |
| if ($text =~ s/^([A-Za-z0-9]+)//o) { |
| $result .= $1; |
| } elsif ($text =~ s/^ //o) { |
| $result .= '-'; |
| } elsif ($text =~ s/^(.)//o) { |
| my $char = $1; |
| if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) { |
| $result .= '_' . lc($Texinfo::Convert::Unicode::unicode_simple_character_map{$char}); |
| } else { |
| if (ord($char) <= hex(0xFFFF)) { |
| $result .= '_' . lc(sprintf("%04x",ord($char))); |
| } else { |
| $result .= '__' . lc(sprintf("%06x",ord($char))); |
| } |
| } |
| } else { |
| warn "Bug: unknown character _unicode_to_protected (likely in infinite loop)\n"; |
| print STDERR "Text: !!$text!!\n"; |
| sleep 1; |
| } |
| } |
| return $result; |
| } |
| |
| sub _unicode_to_transliterate($;$) |
| { |
| my $text = shift; |
| my $no_unidecode = shift; |
| if (chomp($text)) { |
| warn "Bug: end of line to transliterate: $text\n"; |
| } |
| my $result = ''; |
| while ($text ne '') { |
| if ($text =~ s/^([A-Za-z0-9 ]+)//o) { |
| $result .= $1; |
| } elsif ($text =~ s/^(.)//o) { |
| my $char = $1; |
| if (exists($Texinfo::Convert::Unicode::unicode_simple_character_map{$char})) { |
| $result .= $char; |
| } elsif (ord($char) <= hex(0xFFFF) |
| and exists($Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))})) { |
| $result .= $Texinfo::Convert::Unicode::transliterate_map{uc(sprintf("%04x",ord($char)))}; |
| } elsif (ord($char) <= hex(0xFFFF) |
| and exists($Texinfo::Convert::Unicode::diacritics_accent_commands{uc(sprintf("%04x",ord($char)))})) { |
| $result .= ''; |
| # in this case, we want to avoid calling unidecode, as we are sure |
| # that there is no useful transliteration of the unicode character |
| # instead we want to keep it as is. |
| # This is the case, for example, for @exclamdown, is corresponds |
| # with x00a1, but unidecode transliterates it to a !, we want |
| # to avoid that and keep x00a1. |
| } elsif (ord($char) <= hex(0xFFFF) |
| and exists($Texinfo::Convert::Unicode::no_transliterate_map{uc(sprintf("%04x",ord($char)))})) { |
| $result .= $char; |
| } else { |
| if ($no_unidecode) { |
| if (ord($char) <= hex(0xFFFF) |
| and exists ($Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))})) { |
| $result .= $Texinfo::Convert::Unicode::transliterate_accent_map{uc(sprintf("%04x",ord($char)))}; |
| } else { |
| $result .= $char; |
| } |
| } else { |
| $result .= unidecode($char); |
| } |
| } |
| #print STDERR " ($no_unidecode) $text -> CHAR: ".ord($char)." ".uc(sprintf("%04x",ord($char)))."\n$result\n"; |
| } else { |
| warn "Bug: unknown character _unicode_to_transliterate (likely in infinite loop)\n"; |
| print STDERR "Text: !!$text!!\n"; |
| sleep 1; |
| } |
| } |
| return $result; |
| } |
| |
| |
| |
| sub _convert($;$); |
| |
| sub _convert($;$) |
| { |
| my $root = shift; |
| my $in_sc = shift; |
| |
| if (0) { |
| print STDERR "root\n"; |
| print STDERR " Command: $root->{'cmdname'}\n" if ($root->{'cmdname'}); |
| print STDERR " Type: $root->{'type'}\n" if ($root->{'type'}); |
| print STDERR " Text: $root->{'text'}\n" if (defined($root->{'text'})); |
| #print STDERR " Special def_command: $root->{'extra'}->{'def_command'}\n" |
| # if (defined($root->{'extra'}) and $root->{'extra'}->{'def_command'}); |
| } |
| |
| return '' if (($root->{'type'} and $ignored_types{$root->{'type'}}) |
| or ($root->{'cmdname'} |
| and ($ignored_brace_commands{$root->{'cmdname'}} |
| # here ignore the misc commands |
| or ($root->{'args'} and $root->{'args'}->[0] |
| and $root->{'args'}->[0]->{'type'} |
| and ($root->{'args'}->[0]->{'type'} eq 'misc_line_arg' |
| or $root->{'args'}->[0]->{'type'} eq 'misc_arg'))))); |
| my $result = ''; |
| if (defined($root->{'text'})) { |
| $result = $root->{'text'}; |
| $result =~ s/\s+/ /go; |
| $result = uc($result) if ($in_sc); |
| } |
| if ($root->{'cmdname'}) { |
| my $command = $root->{'cmdname'}; |
| if (defined($normalize_node_no_brace_commands{$root->{'cmdname'}})) { |
| return $normalize_node_no_brace_commands{$root->{'cmdname'}}; |
| } elsif (defined($normalize_node_brace_no_arg_commands{$root->{'cmdname'}})) { |
| $command = $root->{'extra'}->{'clickstyle'} |
| if ($root->{'extra'} |
| and defined($root->{'extra'}->{'clickstyle'}) |
| and defined($normalize_node_brace_no_arg_commands{$root->{'extra'}->{'clickstyle'}})); |
| my $result = $normalize_node_brace_no_arg_commands{$command}; |
| if ($in_sc and $Texinfo::Common::letter_no_arg_commands{$command}) { |
| $result = uc($result); |
| } |
| return $result; |
| # commands with braces |
| } elsif ($accent_commands{$root->{'cmdname'}}) { |
| return '' if (!$root->{'args'}); |
| my $accent_text = _convert($root->{'args'}->[0]); |
| my $accented_char |
| = Texinfo::Convert::Unicode::unicode_accent($accent_text, |
| $root); |
| if (!defined($accented_char)) { |
| # In this case, the node normalization do not follow the specification, |
| # but we cannot do better |
| $accented_char = Texinfo::Convert::Text::ascii_accent($accent_text, |
| $root); |
| } |
| if ($in_sc) { |
| return uc ($accented_char); |
| } else { |
| return $accented_char; |
| } |
| #} elsif ($root->{'cmdname'} eq 'image') { |
| # return _convert($root->{'args'}->[0]); |
| } elsif ($Texinfo::Common::ref_commands{$root->{'cmdname'}}) { |
| my @args_try_order; |
| if ($root->{'cmdname'} eq 'inforef') { |
| @args_try_order = (0, 1, 2); |
| } else { |
| @args_try_order = (0, 1, 2, 4, 3); |
| } |
| foreach my $index (@args_try_order) { |
| if (defined($root->{'args'}->[$index])) { |
| my $text = _convert($root->{'args'}->[$index]); |
| return $text if (defined($text) and $text =~ /\S/); |
| } |
| } |
| return ''; |
| #} elsif ($root->{'cmdname'} eq 'email') { |
| # my $mail = _convert($root->{'args'}->[0]); |
| # return $mail if (defined($mail) and $mail ne ''); |
| # my $text; |
| # $text = _convert($root->{'args'}->[1]) |
| # if (defined($root->{'args'}->[1])); |
| # return $text if (defined($text) and ($text ne '')); |
| # #return $mail; |
| # return ''; |
| # Here all the commands with args are processed, if they have |
| # more than one arg the first one is used. |
| } elsif ($root->{'args'} and $root->{'args'}->[0] |
| and (($root->{'args'}->[0]->{'type'} |
| and $root->{'args'}->[0]->{'type'} eq 'brace_command_arg') |
| or $root->{'cmdname'} eq 'math')) { |
| my $sc = 1 if ($root->{'cmdname'} eq 'sc' or $in_sc); |
| return _convert($root->{'args'}->[0], $sc); |
| } |
| } |
| if ($root->{'contents'}) { |
| foreach my $content (@{$root->{'contents'}}) { |
| $result .= _convert($content, $in_sc); |
| } |
| } |
| $result = '{'.$result.'}' |
| if ($root->{'type'} and $root->{'type'} eq 'bracketed'); |
| return $result; |
| } |
| |
| 1; |
| |
| __END__ |
| |
| =head1 NAME |
| |
| Texinfo::Convert::NodeNameNormalization - Normalize and transliterate Texinfo trees |
| |
| =head1 SYNOPSIS |
| |
| use Texinfo::Convert::NodeNameNormalization qw(normalize_node |
| transliterate_texinfo); |
| |
| my $normalized = normalize_node({'contents' => $node_contents}); |
| |
| my $file_name = transliterate_texinfo({'contents' |
| => $section_contents}); |
| |
| =head1 DESCRIPTION |
| |
| Texinfo::Convert::NodeNameNormalization allows to normalize node names, |
| with C<normalize_node> following the specification described in the |
| Texinfo manual for HTML Xref. This is usefull each time one want a |
| unique identifier for Texinfo content that is only composed of letter, |
| digits, C<-> and C<_>. In C<Texinfo::Parser> C<normalize_node> is used |
| for node, floats and anchor names normalization, but also float |
| types C<@acronym> and C<@abbr> first argument. |
| |
| It is also possible to transliterate non ascii letters, instead of mangling |
| them, with C<transliterate_texinfo>, losing the uniqueness feature of |
| normalized node names. |
| |
| =head1 METHODS |
| |
| =over |
| |
| =item $normalized = normalize_node($tree) |
| |
| The Texinfo I<$tree> is returned as a string, normalized as described in the |
| Texinfo manual for HTML Xref. |
| |
| The result will be poor for Texinfo trees which are not @-command arguments |
| (on an @-command line or in braces), for instance if the tree contains |
| C<@node> or block commands. |
| |
| =item $transliterated = transliterate_texinfo($tree, $no_unidecode) |
| |
| The Texinfo I<$tree> is returned as a string, with non ascii letters |
| transliterated as ascii, but otherwise similar with C<normalize_node> |
| output. If the optional I<$no_unidecode> argument is set, C<Text::Unidecode> |
| is not used for characters whose transliteration is not built-in. |
| |
| =back |
| |
| =head1 AUTHOR |
| |
| Patrice Dumas, E<lt>pertusus@free.frE<gt> |
| |
| =head1 COPYRIGHT AND LICENSE |
| |
| Copyright 2010, 2011, 2012 Free Software Foundation, Inc. |
| |
| This library is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 3 of the License, or (at |
| your option) any later version. |
| |
| =cut |