Bug#425537: linuxdoc-tools: Improving TOC creation in fmt_txt.pl

Agustin Martin Wed, 30 May 2007 03:14:06 -0700

On Tue, May 22, 2007 at 01:37:43PM +0200, Agustin Martin wrote:
> Package: linuxdoc-tools
> Version: 0.9.21-0.8
> Severity: wishlist
> 
> Hi,
> 
> I have noticed that TOC creation in fmt_txt.pl has some
> minor annoyances,
> 
> * Does not work well with chapters (creates numbering after sections)
> * Long lines are not wrapped
> * Tabs and extra whitespace are not stripped, making the result look
>   strange.
> 
> I have been playing with the txt preASP stuff related to TOC creation and
> finally rewrote most of it to my taste, following similar ideas but with a
> number of improvements. In my preliminary tests, things are working with
> chapters, long lines are wrapped and a lot of noise (stray tabs, ...) is
> removed, giving a much better look.
> 
> This is still very experimental, but I am attaching a patch showing the
> current changes. I will test them extensively in the meantime.


Attaching a new version of the patch that also handles apostrophes at
the beginning of a line (I am filing a separate bug report about this) and
has some cosmetic changes.

Cheers,

-- 
Agustin

Index: lib/dist/fmt_txt.pl
===================================================================
RCS file: /home/agmartin/CVSROOT/debian/linuxdoc-tools/lib/dist/fmt_txt.pl,v
retrieving revision 1.3
retrieving revision 1.1.1.1.2.21
diff -u -r1.3 -r1.1.1.1.2.21
--- lib/dist/fmt_txt.pl	7 May 2007 10:17:00 -0000	1.3
+++ lib/dist/fmt_txt.pl	30 May 2007 09:35:57 -0000	1.1.1.1.2.21
@@ -14,6 +14,7 @@
 
 use File::Copy;
 use Text::EntityMap;
+use Text::Wrap;
 use LinuxDocTools::CharEnts;
 use LinuxDocTools::Lang;
 use LinuxDocTools::Vars;
@@ -34,6 +35,30 @@
 $Formats{$txt->{NAME}} = $txt;
 
 # ---------------------------------------------------------------
+sub txt_parse_data
+# ---------------------------------------------------------------
+# Wrapper to parse_data; unless $verbatim is set, also flattens whitespace
+# ---------------------------------------------------------------
+{
+  my $string     = shift;
+  my $verbatim   = shift;
+  my $char_maps  = shift;
+  my $txt_escape = shift;
+  
+  die "fmt_txt::txt_parse_data: Bad number of arguments\n" unless $txt_escape;
+  
+  $string =  &parse_data ($string, $char_maps, $txt_escape);
+  
+  unless ( $verbatim ){
+    $string =~ s/([^\\])\\n/$1 /g;        # Unescaped literal "\n" -> space
+    $string =~ s/([^\\])\\011/$1 /g;      # Unescaped literal "\011" (tab) -> space
+    $string =~ s/\s+/ /g;                 # Collapse whitespace runs to one space
+  }
+  
+  return $string;
+}
+
+# ---------------------------------------------------------------
 $txt->{preNSGMLS} = sub
 # ---------------------------------------------------------------
 #  Set correct NsgmlsOpts
@@ -47,18 +72,18 @@
     $global->{charset}    = "latin1" if $global->{charset} eq "latin";
   }
   
-  #
   #  Is there a cleaner solution than this? Can't do it earlier,
   #  would show up in the help messages...
   #
   #  the language support ja.
   #  the charset  support nippon.
-  #
+
   $global->{format}  = $global->{charset};
   $global->{charset} = "nippon" if $global->{language} eq "ja";
   $global->{format}  = "groff"  if $global->{format} eq "ascii";
   $global->{format}  = "groff"  if $global->{format} eq "nippon";
   $global->{format}  = "groff"  if $global->{format} eq "euc-kr";
+
   $ENV{SGML_SEARCH_PATH} =~ s/txt/$global->{format}/;
   
   $Formats{"groff"}  = $txt;
@@ -80,6 +105,7 @@
   
   $data =~ s|"|\\\&\"|g;   # Insert zero-width space in front of "
   $data =~ s|^\.|\\&.|;	   # ditto in front of . at start of line
+  $data =~ s|^\'|\\&\'|;   # ditto in front of ' at start of line
   $data =~ s|\\|\\\\|g;	   # Escape backslashes
   
   return ($data);
@@ -88,226 +114,190 @@
 # ---------------------------------------------------------------
 $txt->{preASP} = sub
 # ---------------------------------------------------------------
-#  Run the file through the genertoc utility before sgmlsasp. Not needed
-#  when producing a manpage. A lot of code from FJM, untested by me.
+# Pre-process file before sgmlsasp and create a TOC unless producing
+# a manpage. Code based in the genertoc utility and in code from FJM.
 # ---------------------------------------------------------------
 {
-  my ($infile, $outfile) = @_;
-  my (@toc, @lines);
-  my $char_maps = load_char_maps ('.2tr', [ Text::EntityMap::sdata_dirs() ]);
-  
-  if ( $global->{charset} eq "latin1" ){
-    $char_maps = load_char_maps ('.2l1tr', [ Text::EntityMap::sdata_dirs() ]);
-  }
+  my ($INFILE, $OUTFILE) = @_;
+  my $char_maps = ( $global->{charset} eq "latin1" ) ? '.2l1tr' : '.2tr';
+  # Note: `sdata_dirs' list made an anonymous array to have a single argument
+  $char_maps = load_char_maps ($char_maps, [ Text::EntityMap::sdata_dirs() ]);
   
   if ($txt->{manpage}){    
-    while (<$infile>){
+    while (<$INFILE>){
       if ( s/^-// ){
 	chomp;  
-	print $outfile "-" . &parse_data ($_, $char_maps, $txt_escape) . "\n";
+	print $OUTFILE "-" . &parse_data ($_, $char_maps, $txt_escape) . "\n";
       } elsif (/^A/) {
 	/^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/
 	    || die "bad attribute data: $_\n";
 	my ($name,$type,$value) = ($1,$2,$4);
 	if ($type eq "CDATA"){
-	  # CDATA attributes get translated also
+	  # CDATA attributes get also translated
 	  $value = &parse_data ($value, $char_maps, $txt_escape);
 	}
-	print $outfile "A$name $type $value\n";
+	print $OUTFILE "A$name $type $value\n";
       } else {  
-	print $outfile $_;
+	print $OUTFILE $_;
       }
     }
     return;
   }
   
-  # note the conversion of `sdata_dirs' list to an anonymous array to
-  # make a single argument
-  
-  #
-  #  Build TOC. The file is read into @lines in the meantime, we need to
-  #  traverse it twice.
-  #
-  push (@toc, "(HLINE\n");
-  push (@toc, ")HLINE\n");
-  push (@toc, "(P\n");
-  push (@toc, "-" . Xlat ("Table of Contents") . "\n");
-  push (@toc, ")P\n");
-  push (@toc, "(VERB\n");
-  my (@prevheader, @header);
-  my $appendix = 0;
-  my $nonprint = 0;
-  while (<$infile>)
-    {
-      push (@lines, $_);
-
-      if (/^\(SECT(.*)/) 
-        {
-	  @prevheader = @header;
-	  @header = @header[0..$1];
-	  if ($appendix == 1) 
-            {
-	      $header[$1] = "A";
-	      $appendix = 0;
-            } else 
-            {
-	      $header[$1]++;
-	    }
-        }
-      if (/^\(APPEND(.*)/) 
-        {
-	  $appendix = 1;
-        }
-      if (/^\(HEADING/) 
-        {
-	  $_ = <$infile>;
-	  s/\\n/ /g;
-	  push (@lines, $_);
-	  chop;
-	  s/^-//;
-	  $_ = join(".",@header) . " " . $_;
-	  s/\(\\[0-9][0-9][0-9]\)/\\\1/g;
-
-	  if (!$#header) 
-	    {
-	      # put a newline before top-level sections unless previous was also
-	      # a top level section
-	      $_ = "\\n" . $_ unless (!$#prevheader);
-	      # put a . and a space after top level sections
-	      s/ /. /;
-#####	      $_ = "-" . $_ . "\\n";
-	      $_ = "-" . $_;
-	    } 
-	  else 
-	    {
-	      # subsections get indentation matching hierarchy
-	      $_ = "-" . "   " x $#header . $_;
-	    }
-
-#	remove tags from a toc
-	  s/\)TT//g;
-	  s/\(TT//g;
-	  s/\)IT//g;
-	  s/\(IT//g;
-	  s/\)EM//g;
-	  s/\(EM//g;
-	  s/\)BF//g;
-	  s/\(BF//g;
-	  s/AID * CDATA.*$//g;
-	  s/\)LABEL//g;
-	  s/\(LABEL//g;
-
-	  push(@toc, parse_data ($_, $char_maps, $txt_escape));
-
-	  $_ = <$infile>;
-	  while (!/^\)HEADING/) {
-	    s/\\n/ /g; ####
-	    push(@lines, $_);
-	    chop;
-	    s/^-//;
-
-#	remove tags from a toc
-	    s/\)TT//g;
-	    s/\(TT//g;
-	    s/\)IT//g;
-	    s/\(IT//g;
-	    s/\)EM//g;
-	    s/\(EM//g;
-	    s/\)BF//g;
-	    s/\(BF//g;
-	    s/AID * CDATA.*$//g;
-	    s/\)LABEL//g;
-	    s/\(LABEL//g;
-
-#	remove NIDX, NCDX from a toc entry
-	    if (/^\(NIDX$/ || /^\(NCDX$/) { $nonprint = 1; }
-	    if (/^\)NIDX$/ || /^\)NCDX$/) { $nonprint = 1; }
-
-#	  $_ = "-" . $_ . "\\n";
-	    push(@toc, parse_data ($_, $char_maps, $txt_escape))
-	      if (! $nonprint);
-	    $_ = <$infile>;
-	  }
-	  s/\\n/ /g; ###
-	  push(@lines, $_);
-	  push(@toc, "\\n\n");
+  # ---------------------------------
+  # Pre-process file and extract TOC info
+  # ---------------------------------
+  
+  my $inheading    = 0;
+  my $headertext   = '';
+  my $sectionlevel = '';
+  my $appendix     = 0;
+  my $txtout       = "";
+  my $thetoc       = '';
+  my $chapterskip  = 0;
+  my $verbatim     = 0;
+  my @tocarray     = ();
+  my @header       = ();
+  my @prevheader   = ();
+  
+  while ( <$INFILE> ) {
+    if ($inheading){
+      next if ( /^(\(|\))(BF|EM|IT|LABEL|TT)/ );
+      next if ( /^\)TOC/ );
+      
+      if ( s/^-// ) {                # Header text
+	chomp;
+	$headertext .= $_;
+	$headertext .= " ";
+      } elsif (/^\)HEADING/){        # End of header: Write full header text
+	$headertext =~ s/[ \n]*$//;
+	if ( $headertext ) {
+	  $headertext = &txt_parse_data ($headertext, $verbatim, $char_maps, $txt_escape);
+	  $headertext =~ s/^\\n/ /g; # No newlines in header text BOL
+	} else {
+	  $headertext = " ";
+	}
+	$txtout .= "-" . $headertext . "\n";
+	push @tocarray, [$sectionlevel, $headertext];
+	$inheading    = 0;
+	$sectionlevel = '';
+	$txtout .= $_;
+      } else {                       # labels and friends: copy to output
+	$txtout .= $_; 
       }
-    }
-  push (@toc, ")VERB\n");
-  push (@toc, "(HLINE\n");
-  push (@toc, ")HLINE\n");
-
-  my $inheading = 0;
-  my $tipo = '';
-  for (@lines)
-    {
-      if ($inheading)
-        {
-	  next if (/^\)TT/ || /^\(TT/ || /^\)IT/ || /^\(IT/ ||
-                   /^\)EM/ || /^\(EM/ || /^\)BF/ || /^\(BF/);
-	  if (/^-/) 
-            {
-	      $tipo .=  $' ;
-	      chop ($tipo);
-	      $tipo .= " " unless $tipo =~ / $/;
-	    }
-	  else 
-	    {
-	      $tipo =~ s/ $//;
-	      if ($tipo)
-		{
-		  print $outfile "-"
-		      . parse_data ($tipo, $char_maps, $txt_escape)
-		      . "\n";
-		}
-	      print $outfile $_;
-	      $tipo = '';
-	    }
-	  if (/^\)HEADING/)
-	    {
-	      $inheading = 0;
-            }
-	  next;
+      
+    } else { # --- Not in heading 
+      if ( s/^-// ) {
+	chomp;
+	$txtout .=  "-" . &txt_parse_data ($_, $verbatim, $char_maps, $txt_escape) . "\n";
+      } elsif (/^A/) {
+	/^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/
+	    || die "bad attribute data: $_\n";
+	my ($name,$type,$value) = ($1,$2,$4);
+	if ($type eq "CDATA") {      # CDATA attributes get also translated
+	  $value = &txt_parse_data ($value, $verbatim, $char_maps, $txt_escape);
 	}
-      if (/^\(HEADING/) 
-        {
-	  #
-	  #  Go into heading processing mode.
-	  #
-	  $tipo = '';
-	  $inheading = 1;
+	$txtout .= "A$name $type $value\n";
+      } elsif (/^\(TOC/) {           # Placeholder for TOC
+	$txtout .= "##TOC##";
+      } else {       # Nothing below changes output, just info is recorded
+	if (/^\(HEADING/) {          #  Go into heading processing mode.
+	  $headertext   = '';
+	  $inheading    = 1;
+	} elsif (/^\(CHAPT/) {
+	  $sectionlevel = 0;
+	  $chapterskip  = 1;         # Start sectioning with chapter
+	  if ( $appendix ) {
+	    $sectionlevel = "A$sectionlevel";
+	    $appendix     = 0;
+	  }
+	} elsif (/^\(SECT(.*)/) {
+	  $sectionlevel = $1 ? $1 : 0;
+	  $sectionlevel += $chapterskip;
+	  if ( $appendix ) {
+	    $sectionlevel = "A$sectionlevel";
+	    $appendix     = 0;
+	  }
+	} elsif (/^\(APPEND(.*)/) {  # appendix mode
+	  $appendix = 1;
+	} elsif (/^\(VERB/) {        # verbatim mode
+	  $verbatim = 1;
+	} elsif (/^\)VERB/){         # end of verbatim
+	  $verbatim = 0;
 	}
-      if (/^\(TOC/)
-        {
-	  print $outfile @toc;
-	  next;
+	$txtout .= $_;
+      }
+    } 
+  } # end of  while (<$INFILE>) loop
+  
+  # ----------------------------
+  # Post-process the TOC, if any
+  # ----------------------------
+  
+  if ( @tocarray ) {
+    my $toclinelength = 72;          # Length of a normal line
+    @header = @prevheader = ();
+    $thetoc = join ("\n",("(HLINE",
+			  ")HLINE",
+			  "(P",
+			  "-" . Xlat ("Table of Contents"),
+			  ")P",
+			  "(VERB\n"));
+    
+    foreach my $entry ( @tocarray ) {
+      my $level  = $$entry[0];       # Section level
+      my $text   = $$entry[1];       # section entry
+      my $number = '';               # Numbering of the item
+      my $nwhite = '';               # Will be length($number) times " "
+      
+      $text =~ s/(\(|\))(BF|EM|IT|LABEL|TT)//g;
+      $text =~ s/AID * CDATA.*$//g;
+      $text =~ s/\s+/ /g;
+      
+      @prevheader = @header;
+      @header     = @header[0..$level];
+      
+      if ( $level =~ s/^A// ){
+	$header[$level] = "A";
+      } else {
+	$header[$level]++;
+      }
+      
+      my $number = join ('.',@header);
+      
+      if ( ! $#header ) {
+	# put a . after top level sections
+	$number .= '.';
+	# put a newline before top-level sections unless previous is one
+	$number = "\\n" . $number unless (!$#prevheader);
+	$number = "-" . $number;
+      } else {
+	# subsections get indentation matching hierarchy
+	$number = "-" . "   " x $#header . $number;
+      }
+      unless ( $text =~ /^(\(|\))(NCDX|NIDX)$/ ){
+	$nwhite = $number;
+	$nwhite =~ s/^[-\\n]*//;
+	$nwhite = "-" . " " x length($nwhite);
+	$Text::Wrap::columns = $toclinelength - length($nwhite);
+	foreach ( split("\n",wrap('','',$text)) ){
+	  $thetoc .= "$number $_\\n\n";
+	  $number = $nwhite;     # Whitespaces if number is already printed
 	}
-      if (/^-/)
-        {
-	  my ($str) = $';
-	  chop ($str);
-	  print $outfile "-" . parse_data ($str, $char_maps, $txt_escape) . "\n";
-	  next;
-        }
-      elsif (/^A/)
-        {
-	  /^A(\S+) (IMPLIED|CDATA|NOTATION|ENTITY|TOKEN)( (.*))?$/
-	      || die "bad attribute data: $_\n";
-	  my ($name,$type,$value) = ($1,$2,$4);
-	  if ($type eq "CDATA")
-	    {
-	      # CDATA attributes get translated also
-	      $value = parse_data ($value, $char_maps, $txt_escape);
-	    }
-	  print $outfile "A$name $type $value\n";
-	  next;
-        }
-
-      #
-      #  Default action if not skipped over with next: copy in to out.
-      #
-      print $outfile $_;
+      }
     }
+    $thetoc .= join ("\n",(")VERB",
+			   "(HLINE",
+			   ")HLINE\n"));
+  } # Parsed @tocarray
+  
+  if ( $thetoc ){
+    $txtout =~ s/^\#\#TOC\#\#/$thetoc/m;
+  } else {
+    $txtout =~ s/^\#\#TOC\#\#//m;
+  }
+  print $OUTFILE $txtout;
+  return 0;
 };
 
 # ---------------------------------------------------------------
@@ -332,7 +322,7 @@
     }
     s/^\.[ \t].*/\\\&$&/g;
     s/\\fC/\\fR/g;
-    s/^.ft C/.ft R/g; 
+    s/^.ft C/.ft R/g;
     $txtout .= $_;
   }
   
@@ -347,14 +337,17 @@
     my $groffcommand = "| $main::progs->{GROFF} $global->{pass} -T $global->{charset} -t $main::progs->{GROFFMACRO} > $groffout";
     open ( $OUTFILE, $groffcommand )
 	or die "fmt_txt::postASP: Could not open pipe to groff:\n  $groffcommand\n";
+    print STDERR "groff_PIPE: $groffcommand\n"
+	if ( $global->{debug} &&  exists $ENV{'LDT_DEBUG'} );
   }
   
   print $OUTFILE $txtout;
   close $OUTFILE;
   
-  #
-  #  Unless making a manpage, a little bit of work is left.
-  #
+  die " fmt_txt::postASP: Empty output file, error when calling groff. Aborting...\n"
+      if ( ! $txt->{manpage} && -z $groffout );
+
+  #  Unless making a manpage, a bit of work is left.
   
   unless ( $txt->{manpage} ) {
     open ( $TXTFILE, "> $txtfile")
@@ -365,7 +358,10 @@
     
     if ( $txt->{blanks} ) { # No more than $txt->{blanks} continuous blank lines
       my $count = 0;
-      $count = &{$txt->{cutblank}}($count, $TXTFILE, $_) while (<$GROFFOUT>);
+      while ( <$GROFFOUT> ){
+	$count = ( /^$/ ) ? $count + 1  : 0;
+	print $TXTFILE $_ if ( $count <= $txt->{blanks} );
+      }
     } else {
       copy ($GROFFOUT, $TXTFILE);
     }
@@ -376,29 +372,12 @@
   return 0;
 };
 
-# ---------------------------------------------------------------
-$txt->{cutblank} = sub
-# ---------------------------------------------------------------
-# Trim from $in more than $num consecutive blank lines. Write to $out
-# ---------------------------------------------------------------
-{
-  my ($num, $out, $in) = @_;
-  if ( $in =~ /^$/ ){
-    $num++;
-  } else {
-    $num = 0;
-  }
-  if ( $num <= $txt->{blanks} ){
-    print $out $in;
-  }
-  return ($num);
-};
-
 # Ensure we evaluate to true.
 1;
 
 __END__
 
 #Local Variables:
-#perl-indent-level: 2
+# mode: perl
+# perl-indent-level: 2
 #End:

Bug#425537: linuxdoc-tools: Improving TOC creation in fmt_txt.pl

Reply via email to