# blob: 9be44a6c5f25c3eecb2b833a28c9fd3813fc6f04 [file] [log] [blame]
# Line.pm: handle line of text.
#
# Copyright 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License,
# or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Original author: Patrice Dumas <pertusus@free.fr>
# This module has nothing Texinfo-specific.  It is similar to
# Texinfo::Convert::Paragraph, but simpler.
# The output of a word is delayed in order to be able to detect when an
# upper-case letter is before an end of line.
package Texinfo::Convert::Line;
use 5.006;
use strict;
use Unicode::EastAsianWidth;
# Initialize a line object.
#
# Arguments:
#   $class  class name to bless the object into
#   $conf   optional hash reference overriding the default state
#
# Every key of $conf is copied into the object as is, except 'text':
# that value is not stored, its display width is used as the initial
# column counter, and a non-zero width means the line is already begun.
sub new($;$)
{
  my ($class, $conf) = @_;

  my %state = (
    'indent_length'  => 0,
    'counter'        => 0,
    'space'          => '',
    'frenchspacing'  => 0,
    'line_beginning' => 1,
    'lines_counter'  => 0,
    'end_line_count' => 0,
  );
  my $self = \%state;

  if (defined($conf)) {
    foreach my $key (keys(%$conf)) {
      my $value = $conf->{$key};
      if ($key ne 'text') {
        $self->{$key} = $value;
        next;
      }
      $self->{'counter'} = Texinfo::Convert::Unicode::string_width($value);
      $self->{'line_beginning'} = 0 if ($self->{'counter'});
    }
  }
  return bless $self, $class;
}
# For debug: print a one-line summary of the line object on STDERR
# (line_beginning flag, counter, pending word, pending space and the
# end_sentence marker).  Undefined word/end_sentence show as UNDEF.
sub dump($)
{
  my ($self) = @_;
  my $word = defined($self->{'word'}) ? $self->{'word'} : 'UNDEF';
  my $end_sentence = defined($self->{'end_sentence'})
                       ? $self->{'end_sentence'} : 'UNDEF';
  print STDERR "line ($self->{'line_beginning'},$self->{'counter'}) word: $word, space `$self->{'space'}' end_sentence: $end_sentence\n";
}
# Public way to end the current line.  The count of consecutive line
# ends is reset before delegating to _end_line(), which increments it.
# Returns what _end_line() returns.
sub end_line($)
{
  my ($line) = @_;
  $line->{'end_line_count'} = 0;
  my $ended = $line->_end_line();
  return $ended;
}
# End a line: flush the pending word, put the object back in the
# "beginning of line" state and count the line.
# Returns the flushed text followed by a newline.
sub _end_line($)
{
  my ($line) = @_;
  my $flushed = $line->_add_pending_word();

  # back to an empty line
  $line->{'counter'} = 0;
  $line->{'space'} = '';
  $line->{'line_beginning'} = 1;

  # one more line overall, and one more consecutive end of line
  $line->{'lines_counter'}++;
  $line->{'end_line_count'}++;

  print STDERR "END_LINE.L\n" if ($line->{'DEBUG'});
  return $flushed . "\n";
}
# Accessor for the 'end_line_count' field of the line object.
sub end_line_count($)
{
  my ($line) = @_;
  my $count = $line->{'end_line_count'};
  return $count;
}
# Return the text that is still pending, that is the pending space (if
# true) followed by the pending word (if defined).  The line object is
# not modified.
sub get_pending($)
{
  my ($line) = @_;
  my @pending;
  push @pending, $line->{'space'} if ($line->{'space'});
  push @pending, $line->{'word'} if (defined($line->{'word'}));
  return join('', @pending);
}
# Public way to flush the pending word.  Resets the count of consecutive
# line ends, then delegates to _add_pending_word().
#
# Arguments:
#   $line        the line object
#   $add_spaces  optional flag passed through to _add_pending_word()
sub add_pending_word($;$)
{
  my ($line, $add_spaces) = @_;
  $line->{'end_line_count'} = 0;
  my $flushed = $line->_add_pending_word($add_spaces);
  return $flushed;
}
# Put the pending word and spaces in the result string.
#
# Arguments:
#   $line        the line object
#   $add_spaces  optional; when true the indentation or pending space is
#                output even if there is no pending word
#
# Returns the text to output, '' when there is nothing to flush.
# The second argument is optional, hence the ($;$) prototype.
sub _add_pending_word($;$)
{
  my ($line, $add_spaces) = @_;
  my $result = '';

  # Nothing to do unless a word is pending or spaces are forced out.
  return $result unless (defined($line->{'word'}) or $add_spaces);

  if ($line->{'line_beginning'}) {
    # First output on the line: pad up to the indentation, minus what
    # 'counter' already accounts for.  Pending space is not output here.
    if ($line->{'indent_length'}) {
      $result .= ' ' x ($line->{'indent_length'} - $line->{'counter'});
      print STDERR "INDENT.L($line->{'counter'})\n" if ($line->{'DEBUG'});
    }
    $line->{'line_beginning'} = 0;
  } elsif ($line->{'space'}) {
    $result .= $line->{'space'};
    print STDERR "ADD_SPACES.L\n" if ($line->{'DEBUG'});
  }
  $line->{'space'} = '';

  if (defined($line->{'word'})) {
    $result .= $line->{'word'};
    print STDERR "ADD_WORD.L[$line->{'word'}]\n" if ($line->{'DEBUG'});
    # The word is output, forget it and its last character.
    $line->{'word'} = undef;
    $line->{'last_char'} = undef;
  }
  return $result;
}
# Finish with the line object: flush the pending word and return it
# followed by whatever space is still pending.  No newline is added.
# The count of consecutive line ends is reset.
sub end($)
{
  my ($line) = @_;
  $line->{'end_line_count'} = 0;
  my $flushed = $line->_add_pending_word();
  my $result = $flushed . $line->{'space'};
  print STDERR "END_LINE.L\n" if ($line->{'DEBUG'});
  return $result;
}
# Public way to add text to the pending word.  Resets the count of
# consecutive line ends, then delegates to _add_next().
#
# Arguments:
#   $line         the line object
#   $word         optional text to append to the pending word
#   $transparent  optional flag passed through to _add_next()
sub add_next($;$$)
{
  my ($line, $word, $transparent) = @_;
  $line->{'end_line_count'} = 0;
  my $added = $line->_add_next($word, $transparent);
  return $added;
}
# Characters that can end a sentence.  The string is escaped with
# quotemeta because it is interpolated into regex character classes.
my $end_sentence_character = quotemeta('.?!');
# Closing quotes and brackets that may follow an end of sentence
# character, escaped in the same way for use in character classes.
my $after_punctuation_characters = quotemeta('"\')]');
# Add text to the pending word.
#
# Arguments:
#   $line         the line object
#   $word         text to append; nothing is done if undefined.  A
#                 trailing \x08 is removed and marks the word as allowing
#                 a following end of sentence.
#   $transparent  when true, 'last_char' is left as it is
#
# Always returns ''; the text is kept pending in the line object.
sub _add_next($;$$)
{
  my ($line, $word, $transparent) = @_;
  my $result = '';
  return $result if (!defined($word));

  # full stop after capital letter ends sentence
  my $disinhibit = ($word =~ s/\x08$//) ? 1 : undef;

  if (!defined($line->{'word'})) {
    # Beginning of a new word.
    $line->{'word'} = '';
    $line->{'last_char'} = '';
    my $after_sentence = ($line->{'end_sentence'}
                          and $line->{'end_sentence'} > 0
                          and !$line->{'frenchspacing'}
                          and !$line->{'line_beginning'}
                          and $line->{'space'});
    if ($after_sentence) {
      # Complete the pending space to two spaces after a sentence end,
      # unless the new text itself begins with a space.
      $line->{'space'} .= ' ' x (2 - length($line->{'space'}))
        if ($word !~ /^\s/);
      delete $line->{'end_sentence'};
    }
  }

  $line->{'word'} .= $word;

  if (!$transparent) {
    if ($disinhibit) {
      # Act as if the word ended with a lower-case letter.
      $line->{'last_char'} = 'a';
    } elsif ($word =~
         /([^$end_sentence_character$after_punctuation_characters])
          [$end_sentence_character$after_punctuation_characters]*$/x) {
      # Save the last character in $word before punctuation
      $line->{'last_char'} = $1;
    }
  }

  print STDERR "WORD+.L $word -> $line->{'word'}\n" if ($line->{'DEBUG'});
  return $result;
}
# Mark that the line is not at an end of sentence by setting the
# 'end_sentence' field to 0.
sub remove_end_sentence($)
{
  my ($line) = @_;
  $line->{'end_sentence'} = 0;
}
# Set the 'end_sentence' field to $value.  The field becomes undefined
# when $value is omitted.
sub add_end_sentence($;$)
{
  my ($line, $value) = @_;
  $line->{'end_sentence'} = $value;
}
# Allow a following end of sentence: record a lower-case letter as the
# last character, whatever the pending word really ends with.
sub allow_end_sentence($)
{
  my ($line) = @_;
  $line->{'last_char'} = 'a'; # lower-case
}
# Change the space handling settings of the line.
#
# Arguments (an undefined value leaves the matching setting unchanged):
#   $line              the line object
#   $space_protection  stored as 'protect_spaces'
#   $ignore_columns    stored as 'ignore_columns'
#   $keep_end_lines    stored as 'keep_end_lines'
#   $frenchspacing     stored as 'frenchspacing'
#
# Always returns ''.
sub set_space_protection($$;$$$)
{
  my ($line, $space_protection, $ignore_columns, $keep_end_lines,
      $frenchspacing) = @_;

  my @settings = (
    ['protect_spaces', $space_protection],
    ['ignore_columns', $ignore_columns],
    # a no-op in fact
    ['keep_end_lines', $keep_end_lines],
  );
  foreach my $setting (@settings) {
    my ($key, $value) = @$setting;
    $line->{$key} = $value if defined($value);
  }

  # When frenchspacing gets switched on while the space after a sentence
  # end is pending and no word is begun, complete that space to two
  # spaces before the new setting applies.
  my $complete_sentence_space = (!$line->{'frenchspacing'}
                                 and $frenchspacing
                                 and $line->{'end_sentence'}
                                 and !$line->{'line_beginning'}
                                 and $line->{'space'}
                                 and !defined($line->{'word'}));
  if ($complete_sentence_space) {
    $line->{'space'} .= ' ' x (2 - length($line->{'space'}));
    print STDERR "SWITCH.L frenchspacing end sentence space\n" if ($line->{'DEBUG'});
    delete $line->{'end_sentence'};
  }

  $line->{'frenchspacing'} = $frenchspacing if defined($frenchspacing);

  # begin a word, to have something even if empty
  $line->_add_next('') if ($space_protection);

  return '';
}
# Wrap a text.
#
# Arguments:
#   $line  the line object
#   $text  the text to add
#
# The text is consumed piece by piece: runs of breakable spaces, words
# (non breaking spaces count as word characters), newlines and
# full-width characters.  Returns the text that is ready to be output;
# the last word and space may remain pending in the line object.
# The count of consecutive line ends is reset on entry.
sub add_text($$)
{
  my $line = shift;
  my $text = shift;
  $line->{'end_line_count'} = 0;
  my $result = '';

  while ($text ne '') {
    if ($line->{'DEBUG'}) {
      my $word = 'UNDEF';
      $word = $line->{'word'} if (defined($line->{'word'}));
      print STDERR "s `$line->{'space'}', w `$word'\n";
    }
    # \x{202f}\x{00a0} are non breaking spaces
    if ($text =~ s/^([^\S\x{202f}\x{00a0}\n]+)//) {
      my $spaces = $1;
      print STDERR "SPACES.L\n" if ($line->{'DEBUG'});
      if ($line->{'protect_spaces'}) {
        # Protected spaces are kept as they are in the pending word.
        $line->{'word'} .= $spaces;
        $line->{'last_char'} = substr($spaces, -1);
      } else {
        $result .= $line->_add_pending_word();
        # Spaces at the beginning of a line are ignored.
        if (!$line->{'line_beginning'}) {
          if (!$line->{'frenchspacing'}
              and $line->{'end_sentence'}
              and $line->{'end_sentence'} > 0) {
            if (length($line->{'space'}) >= 1 or length($spaces) > 1) {
              # There are at least two spaces after the sentence end:
              # normalize them to exactly two spaces.
              my $all_spaces = substr($line->{'space'} . $spaces, 0, 2);
              $all_spaces =~ s/[\n\r]/ /g;
              $all_spaces .= ' ' x (2 - length($all_spaces));
              $line->{'space'} = $all_spaces;
              delete $line->{'end_sentence'};
            } else {
              # Only one space for now; end_sentence is kept, such that
              # more space or the next word may complete it.
              my $new_space = $spaces;
              $new_space =~ s/^[\n\r]/ /;
              $line->{'space'} = $new_space;
            }
          } else {
            # Not after a sentence end: only the first space is kept.
            my $new_space = substr($spaces, 0, 1);
            $new_space =~ s/^[\n\r]/ /;
            $line->{'space'} = $new_space;
          }
        }
      }
    } elsif ($text =~ s/^(([^\s\p{InFullwidth}]|[\x{202f}\x{00a0}])+)//) {
      my $added_word = $1;
      # Whether a sentence end is permitted in spite of a preceding
      # upper case letter.
      my $disinhibit = 0;
      # Reverse the insertion of the control character in Plaintext.pm.
      # The control character right before the end of sentence
      # punctuation is what permits the sentence end.
      if ($added_word =~ s/\x08(?=[$end_sentence_character]
                                  [$after_punctuation_characters]*$)//x) {
        $disinhibit = 1;
      }
      $result .= _add_next($line, $added_word);
      # Check if it is considered as an end of sentence.  There are two things
      # to check: one, that we have a ., ! or ?; and second, that it is not
      # preceded by an upper-case letter (ignoring some punctuation)
      if (defined($line->{'end_sentence'})
          and $added_word =~ /^[$after_punctuation_characters]*$/) {
        # do nothing in the case of a continuation of
        # after_punctuation_characters
      } elsif (($disinhibit
                or !$line->{'last_char'}
                or $line->{'last_char'} !~ /[[:upper:]]/)
               and $added_word =~ /[$end_sentence_character]
                                   [$after_punctuation_characters]*$/x) {
        # With frenchspacing the sentence end is recorded as -1, which
        # does not trigger the two spaces handling.
        if ($line->{'frenchspacing'}) {
          $line->{'end_sentence'} = -1;
        } else {
          $line->{'end_sentence'} = 1;
        }
        print STDERR "END_SENTENCE.L\n" if ($line->{'DEBUG'});
      } else {
        print STDERR "delete END_SENTENCE.L($line->{'end_sentence'}): text\n"
          if (defined($line->{'end_sentence'}) and $line->{'DEBUG'});
        delete $line->{'end_sentence'};
      }
    } elsif ($text =~ s/^\n//) {
      $result .= $line->_end_line();
    } elsif ($text =~ s/^(\p{InFullwidth})//) {
      # A full-width character is output at once, one at a time.
      my $added = $1;
      print STDERR "EAST_ASIAN.L\n" if ($line->{'DEBUG'});
      if (!defined($line->{'word'})) {
        $line->{'word'} = '';
      }
      $line->{'word'} .= $added;
      $line->{'last_char'} = $added;
      $result .= $line->_add_pending_word();
      delete $line->{'end_sentence'};
      $line->{'space'} = '';
    } else {
      # Some characters are not handled by the cases above.
      # For example, it happened for strange characters that seem to be
      # some special spaces.  It is a bit strange since the cases above
      # include a possibility and the complement.  Maybe a character
      # invalid in a given encoding?
      # The remaining text is dropped.
      #die "Unknown character leading $text";
      last;
    }
  }
  return $result;
}
1;