On Tue, May 22, 2007 at 01:37:43PM +0200, Agustin Martin wrote: > Package: linuxdoc-tools > Version: 0.9.21-0.8 > Severity: wishlist > > Hi, > > I have noticed that TOC creation in fmt_txt.pl has some > minor annoyances, > > * Does not work well with chapters (creates numbering after sections) > * Long lines are not wrapped > * Tabs and extra whitespace are not stripped, making the result visually > strange. > > I have been playing with the txt postASP stuff related to TOC creation and > finally mostly rewrote it to my taste, following similar ideas, with a number of > improvements. In my preliminary tests, things are working with chapters, > long lines are wrapped and a lot of noise (pending tabs, ...) is removed, > giving a much better look. > > This is still very experimental, but I am attaching a patch showing the > current changes. I will test them extensively in the meantime.
Attaching a new version of the patch that also handles apostrophes at the beginning of a line (I am filing a separate bug report about this) and has some cosmetic changes. Cheers, -- Agustin
Index: lib/dist/fmt_txt.pl =================================================================== RCS file: /home/agmartin/CVSROOT/debian/linuxdoc-tools/lib/dist/fmt_txt.pl,v retrieving revision 1.3 retrieving revision 1.1.1.1.2.21 diff -u -r1.3 -r1.1.1.1.2.21 --- lib/dist/fmt_txt.pl 7 May 2007 10:17:00 -0000 1.3 +++ lib/dist/fmt_txt.pl 30 May 2007 09:35:57 -0000 1.1.1.1.2.21 @@ -14,6 +14,7 @@ use File::Copy; use Text::EntityMap; +use Text::Wrap; use LinuxDocTools::CharEnts; use LinuxDocTools::Lang; use LinuxDocTools::Vars; @@ -34,6 +35,30 @@ $Formats{$txt->{NAME}} = $txt; # --------------------------------------------------------------- +sub txt_parse_data +# --------------------------------------------------------------- +# Wrapper to parse_data, removing some things if not in verbatim +# --------------------------------------------------------------- +{ + my $string = shift; + my $verbatim = shift; + my $char_maps = shift; + my $txt_escape = shift; + + die "fmt_txt::txt_parse_data: Bad number of arguments\n" unless $txt_escape; + + $string = &parse_data ($string, $char_maps, $txt_escape); + + unless ( $verbatim ){ + $string =~ s/([^\\])\\n/$1 /g; # No unescaped \n in text + $string =~ s/([^\\])\\011/$1 /g; # No tabulars in text + $string =~ s/\s+/ /g; # + } + + return $string; +} + +# --------------------------------------------------------------- $txt->{preNSGMLS} = sub # --------------------------------------------------------------- # Set correct NsgmlsOpts @@ -47,18 +72,18 @@ $global->{charset} = "latin1" if $global->{charset} eq "latin"; } - # # Is there a cleaner solution than this? Can't do it earlier, # would show up in the help messages... # # the language support ja. # the charset support nippon. 
- # + $global->{format} = $global->{charset}; $global->{charset} = "nippon" if $global->{language} eq "ja"; $global->{format} = "groff" if $global->{format} eq "ascii"; $global->{format} = "groff" if $global->{format} eq "nippon"; $global->{format} = "groff" if $global->{format} eq "euc-kr"; + $ENV{SGML_SEARCH_PATH} =~ s/txt/$global->{format}/; $Formats{"groff"} = $txt; @@ -80,6 +105,7 @@ $data =~ s|"|\\\&\"|g; # Insert zero-width space in front of " $data =~ s|^\.|\\&.|; # ditto in front of . at start of line + $data =~ s|^\'|\\&\'|; # ditto in front of ' at start of line $data =~ s|\\|\\\\|g; # Escape backslashes return ($data); @@ -88,226 +114,190 @@ # --------------------------------------------------------------- $txt->{preASP} = sub # --------------------------------------------------------------- -# Run the file through the genertoc utility before sgmlsasp. Not needed -# when producing a manpage. A lot of code from FJM, untested by me. +# Pre-process file before sgmlsasp and create a TOC unless producing +# a manpage. Code based in the genertoc utility and in code from FJM. # --------------------------------------------------------------- { - my ($infile, $outfile) = @_; - my (@toc, @lines); - my $char_maps = load_char_maps ('.2tr', [ Text::EntityMap::sdata_dirs() ]); - - if ( $global->{charset} eq "latin1" ){ - $char_maps = load_char_maps ('.2l1tr', [ Text::EntityMap::sdata_dirs() ]); - } + my ($INFILE, $OUTFILE) = @_; + my $char_maps = ( $global->{charset} eq "latin1" ) ? '.2l1tr' : '.2tr'; + # Note: `sdata_dirs' list made an anonymous array to have a single argument + $char_maps = load_char_maps ($char_maps, [ Text::EntityMap::sdata_dirs() ]); if ($txt->{manpage}){ - while (<$infile>){ + while (<$INFILE>){ if ( s/^-// ){ chomp; - print $outfile "-" . &parse_data ($_, $char_maps, $txt_escape) . "\n"; + print $OUTFILE "-" . &parse_data ($_, $char_maps, $txt_escape) . 
"\n"; } elsif (/^A/) { /^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/ || die "bad attribute data: $_\n"; my ($name,$type,$value) = ($1,$2,$4); if ($type eq "CDATA"){ - # CDATA attributes get translated also + # CDATA attributes get also translated $value = &parse_data ($value, $char_maps, $txt_escape); } - print $outfile "A$name $type $value\n"; + print $OUTFILE "A$name $type $value\n"; } else { - print $outfile $_; + print $OUTFILE $_; } } return; } - # note the conversion of `sdata_dirs' list to an anonymous array to - # make a single argument - - # - # Build TOC. The file is read into @lines in the meantime, we need to - # traverse it twice. - # - push (@toc, "(HLINE\n"); - push (@toc, ")HLINE\n"); - push (@toc, "(P\n"); - push (@toc, "-" . Xlat ("Table of Contents") . "\n"); - push (@toc, ")P\n"); - push (@toc, "(VERB\n"); - my (@prevheader, @header); - my $appendix = 0; - my $nonprint = 0; - while (<$infile>) - { - push (@lines, $_); - - if (/^\(SECT(.*)/) - { - @prevheader = @header; - @header = @header[0..$1]; - if ($appendix == 1) - { - $header[$1] = "A"; - $appendix = 0; - } else - { - $header[$1]++; - } - } - if (/^\(APPEND(.*)/) - { - $appendix = 1; - } - if (/^\(HEADING/) - { - $_ = <$infile>; - s/\\n/ /g; - push (@lines, $_); - chop; - s/^-//; - $_ = join(".",@header) . " " . $_; - s/\(\\[0-9][0-9][0-9]\)/\\\1/g; - - if (!$#header) - { - # put a newline before top-level sections unless previous was also - # a top level section - $_ = "\\n" . $_ unless (!$#prevheader); - # put a . and a space after top level sections - s/ /. /; -##### $_ = "-" . $_ . "\\n"; - $_ = "-" . $_; - } - else - { - # subsections get indentation matching hierarchy - $_ = "-" . " " x $#header . 
$_; - } - -# remove tags from a toc - s/\)TT//g; - s/\(TT//g; - s/\)IT//g; - s/\(IT//g; - s/\)EM//g; - s/\(EM//g; - s/\)BF//g; - s/\(BF//g; - s/AID * CDATA.*$//g; - s/\)LABEL//g; - s/\(LABEL//g; - - push(@toc, parse_data ($_, $char_maps, $txt_escape)); - - $_ = <$infile>; - while (!/^\)HEADING/) { - s/\\n/ /g; #### - push(@lines, $_); - chop; - s/^-//; - -# remove tags from a toc - s/\)TT//g; - s/\(TT//g; - s/\)IT//g; - s/\(IT//g; - s/\)EM//g; - s/\(EM//g; - s/\)BF//g; - s/\(BF//g; - s/AID * CDATA.*$//g; - s/\)LABEL//g; - s/\(LABEL//g; - -# remove NIDX, NCDX from a toc entry - if (/^\(NIDX$/ || /^\(NCDX$/) { $nonprint = 1; } - if (/^\)NIDX$/ || /^\)NCDX$/) { $nonprint = 1; } - -# $_ = "-" . $_ . "\\n"; - push(@toc, parse_data ($_, $char_maps, $txt_escape)) - if (! $nonprint); - $_ = <$infile>; - } - s/\\n/ /g; ### - push(@lines, $_); - push(@toc, "\\n\n"); + # --------------------------------- + # Pre-process file and extract TOC info + # --------------------------------- + + my $inheading = 0; + my $headertext = ''; + my $sectionlevel = ''; + my $appendix = 0; + my $txtout = ""; + my $thetoc = ''; + my $chapterskip = 0; + my $verbatim = 0; + my @tocarray = (); + my @header = (); + my @prevheader = (); + + while ( <$INFILE> ) { + if ($inheading){ + next if ( /^(\(|\))(BF|EM|IT|LABEL|TT)/ ); + next if ( /^\)TOC/ ); + + if ( s/^-// ) { # Header text + chomp; + $headertext .= $_; + $headertext .= " "; + } elsif (/^\)HEADING/){ # End of header: Write full header text + $headertext =~ s/[ \n]*$//; + if ( $headertext ) { + $headertext = &txt_parse_data ($headertext, $verbatim, $char_maps, $txt_escape); + $headertext =~ s/^\\n/ /g; # No newlines in header text BOL + } else { + $headertext = " "; + } + $txtout .= "-" . $headertext . 
"\n"; + push @tocarray, [$sectionlevel, $headertext]; + $inheading = 0; + $sectionlevel = ''; + $txtout .= $_; + } else { # labels and friends: copy to output + $txtout .= $_; } - } - push (@toc, ")VERB\n"); - push (@toc, "(HLINE\n"); - push (@toc, ")HLINE\n"); - - my $inheading = 0; - my $tipo = ''; - for (@lines) - { - if ($inheading) - { - next if (/^\)TT/ || /^\(TT/ || /^\)IT/ || /^\(IT/ || - /^\)EM/ || /^\(EM/ || /^\)BF/ || /^\(BF/); - if (/^-/) - { - $tipo .= $' ; - chop ($tipo); - $tipo .= " " unless $tipo =~ / $/; - } - else - { - $tipo =~ s/ $//; - if ($tipo) - { - print $outfile "-" - . parse_data ($tipo, $char_maps, $txt_escape) - . "\n"; - } - print $outfile $_; - $tipo = ''; - } - if (/^\)HEADING/) - { - $inheading = 0; - } - next; + + } else { # --- Not in heading + if ( s/^-// ) { + chomp; + $txtout .= "-" . &txt_parse_data ($_, $verbatim, $char_maps, $txt_escape) . "\n"; + } elsif (/^A/) { + /^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/ + || die "bad attribute data: $_\n"; + my ($name,$type,$value) = ($1,$2,$4); + if ($type eq "CDATA") { # CDATA attributes get also translated + $value = &txt_parse_data ($value, $verbatim, $char_maps, $txt_escape); } - if (/^\(HEADING/) - { - # - # Go into heading processing mode. - # - $tipo = ''; - $inheading = 1; + $txtout .= "A$name $type $value\n"; + } elsif (/^\(TOC/) { # Placeholder for TOC + $txtout .= "##TOC##"; + } else { # Nothing below changes output, just info is recorded + if (/^\(HEADING/) { # Go into heading processing mode. + $headertext = ''; + $inheading = 1; + } elsif (/^\(CHAPT/) { + $sectionlevel = 0; + $chapterskip = 1; # Start sectioning with chapter + if ( $appendix ) { + $sectionlevel = "A$sectionlevel"; + $appendix = 0; + } + } elsif (/^\(SECT(.*)/) { + $sectionlevel = $1 ? 
$1 : 0; + $sectionlevel += $chapterskip; + if ( $appendix ) { + $sectionlevel = "A$sectionlevel"; + $appendix = 0; + } + } elsif (/^\(APPEND(.*)/) { # appendix mode + $appendix = 1; + } elsif (/^\(VERB/) { # verbatim mode + $verbatim = 1; + } elsif (/^\)VERB/){ # end of verbatim + $verbatim = 0; } - if (/^\(TOC/) - { - print $outfile @toc; - next; + $txtout .= $_; + } + } + } # end of while (<$INFILE>) loop + + # ---------------------------- + # Post-process the TOC, if any + # ---------------------------- + + if ( @tocarray ) { + my $toclinelength = 72; # Length of a normal line + @header = @prevheader = (); + $thetoc = join ("\n",("(HLINE", + ")HLINE", + "(P", + "-" . Xlat ("Table of Contents"), + ")P", + "(VERB\n")); + + foreach my $entry ( @tocarray ) { + my $level = $$entry[0]; # Section level + my $text = $$entry[1]; # section entry + my $number = ''; # Numbering of the item + my $nwhite = ''; # Will be length($number) times " " + + $text =~ s/(\(|\))(BF|EM|IT|LABEL|TT)//g; + $text =~ s/AID * CDATA.*$//g; + $text =~ s/\s+/ /g; + + @prevheader = @header; + @header = @header[0..$level]; + + if ( $level =~ s/^A// ){ + $header[$level] = "A"; + } else { + $header[$level]++; + } + + my $number = join ('.',@header); + + if ( ! $#header ) { + # put a . after top level sections + $number .= '.'; + # put a newline before top-level sections unless previous is one + $number = "\\n" . $number unless (!$#prevheader); + $number = "-" . $number; + } else { + # subsections get indentation matching hierarchy + $number = "-" . " " x $#header . $number; + } + unless ( $text =~ /^(\(|\))(NCDX|NIDX)$/ ){ + $nwhite = $number; + $nwhite =~ s/^[-\\n]*//; + $nwhite = "-" . " " x length($nwhite); + $Text::Wrap::columns = $toclinelength - length($nwhite); + foreach ( split("\n",wrap('','',$text)) ){ + $thetoc .= "$number $_\\n\n"; + $number = $nwhite; # Whitespaces if number is already printed } - if (/^-/) - { - my ($str) = $'; - chop ($str); - print $outfile "-" . 
parse_data ($str, $char_maps, $txt_escape) . "\n"; - next; - } - elsif (/^A/) - { - /^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/ - || die "bad attribute data: $_\n"; - my ($name,$type,$value) = ($1,$2,$4); - if ($type eq "CDATA") - { - # CDATA attributes get translated also - $value = parse_data ($value, $char_maps, $txt_escape); - } - print $outfile "A$name $type $value\n"; - next; - } - - # - # Default action if not skipped over with next: copy in to out. - # - print $outfile $_; + } } + $thetoc .= join ("\n",(")VERB", + "(HLINE", + ")HLINE\n")); + } # Parsed @tocarray + + if ( $thetoc ){ + $txtout =~ s/^\#\#TOC\#\#/$thetoc/m; + } else { + $txtout =~ s/^\#\#TOC\#\#//m; + } + print $OUTFILE $txtout; + return 0; }; # --------------------------------------------------------------- @@ -332,7 +322,7 @@ } s/^\.[ \t].*/\\\&$&/g; s/\\fC/\\fR/g; - s/^.ft C/.ft R/g; + s/^.ft C/.ft R/g; $txtout .= $_; } @@ -347,14 +337,17 @@ my $groffcommand = "| $main::progs->{GROFF} $global->{pass} -T $global->{charset} -t $main::progs->{GROFFMACRO} > $groffout"; open ( $OUTFILE, $groffcommand ) or die "fmt_txt::postASP: Could not open pipe to groff:\n $groffcommand\n"; + print STDERR "groff_PIPE: $groffcommand\n" + if ( $global->{debug} && exists $ENV{'LDT_DEBUG'} ); } print $OUTFILE $txtout; close $OUTFILE; - # - # Unless making a manpage, a little bit of work is left. - # + die " fmt_txt::postASP: Empty output file, error when calling groff. Aborting...\n" + if ( ! $txt->{manpage} && -z $groffout ); + + # Unless making a manpage, a bit of work is left. unless ( $txt->{manpage} ) { open ( $TXTFILE, "> $txtfile") @@ -365,7 +358,10 @@ if ( $txt->{blanks} ) { # No more than $txt->{blanks} continuous blank lines my $count = 0; - $count = &{$txt->{cutblank}}($count, $TXTFILE, $_) while (<$GROFFOUT>); + while ( <$GROFFOUT> ){ + $count = ( /^$/ ) ? 
$count + 1 : 0; + print $TXTFILE $_ if ( $count <= $txt->{blanks} ); + } } else { copy ($GROFFOUT, $TXTFILE); } @@ -376,29 +372,12 @@ return 0; }; -# --------------------------------------------------------------- -$txt->{cutblank} = sub -# --------------------------------------------------------------- -# Trim from $in more than $num consecutive blank lines. Write to $out -# --------------------------------------------------------------- -{ - my ($num, $out, $in) = @_; - if ( $in =~ /^$/ ){ - $num++; - } else { - $num = 0; - } - if ( $num <= $txt->{blanks} ){ - print $out $in; - } - return ($num); -}; - # Ensure we evaluate to true. 1; __END__ #Local Variables: -#perl-indent-level: 2 +# mode: perl +# perl-indent-level: 2 #End: