Attached is a patch to implement character class support. On its own this support is not yet useful, because no attributes can be applied to character classes yet. The syntax is as follows:

classes
        <ClassName> = A B C D E
        <EquivalentClass> = A - E
        <UppercaseAlphabet> = <EquivalentClass> F - Z
        <MostEfficient> = A - Z
        <Identifier> = - A - Z a - z
        <EquivIdentifier> = A - Z - a - z

ClassName and EquivalentClass are equivalent: a range such as A - E stands for every character from A through E, and storing a range is much more efficient than storing each character individually. This is why MostEfficient is the most efficient form, even though it defines the same set as UppercaseAlphabet. Identifier and EquivIdentifier are also equivalent; a hyphen that cannot denote a range stands for itself.

Even though character classes are stored in font files, they are properties of the glyphs, not of the fonts. In other words, all instances of the glyph 'A' will have the same attributes. You probably want to put the same classes and attributes in every font file; otherwise, you will get different results based on the order in which fonts are loaded.

The next method of attack is to implement attributes. Since the immediate goal is kinsoku shori handling, I will be investigating the behavior of the cflags request (flags 2 and 4) and how I can apply that to get the desired behavior. I must say that so far, it does not look like kinsoku shori handling is needed; lines cannot be broken around most glyphs, including CJK glyphs. I would appreciate any corrections to this line of reasoning if I am wrong.

--
brian m. carlson / brian with sandals: Houston, Texas, US
+1 713 440 7475 | http://crustytoothpaste.ath.cx/~bmc | My opinion only
a typesetting engine: http://crustytoothpaste.ath.cx/~bmc/code/thwack
OpenPGP: RSA v4 4096b 88AC E9B2 9196 305B A994 7552 F1BA 225C 0223 B187
diff -ur groff.old/src/include/font.h groff/src/include/font.h
--- groff.old/src/include/font.h	2007-11-19 15:43:33.000000000 +0000
+++ groff/src/include/font.h	2007-12-10 01:27:34.000000000 +0000
@@ -19,6 +19,12 @@
 with groff; see the file COPYING.  If not, write to the Free Software
 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
 
+#include <string>
+#include <map>
+#include <vector>
+
+class char_class;
+
 // A function of this type can be registered to define the semantics of
 // arbitrary commands in a font DESC file.
 typedef void (*FONT_COMMAND_HANDLER)(const char *,	// command
@@ -268,6 +274,8 @@
 			// upper1, ... lowerN, upperN, 0 }.
 
 private:
+  std::map<std::string, std::vector<char_class *> > class_map;
+			// A map of names to class objects.
   unsigned ligatures;	// Bit mask of available ligatures.  Used by
 			// has_ligature().
   font_kern_list **kern_hash_table;	// Hash table of kerning pairs. 
@@ -309,6 +317,11 @@
   void extend_ch();
   void compact();
 
+  // These methods add glyphs to character classes.
+  void add_class(const char *, glyph *);
+  void add_class(const char *, glyph *, glyph *);
+  void add_class(const char *, const char *);
+
   void add_kern(glyph *, glyph *, int);	// Add to the kerning table a
 			// kerning amount (arg3) between two given glyphs
 			// (arg1 and arg2).
diff -ur groff.old/src/libs/libgroff/font.cpp groff/src/libs/libgroff/font.cpp
--- groff.old/src/libs/libgroff/font.cpp	2007-11-19 15:43:33.000000000 +0000
+++ groff/src/libs/libgroff/font.cpp	2007-12-10 01:28:05.000000000 +0000
@@ -32,6 +32,7 @@
 #include "font.h"
 #include "unicode.h"
 #include "paper.h"
+#include "classes.h"
 
 const char *const WS = " \t\n\r";
 
@@ -779,6 +780,38 @@
   return 0;
 }
 
+// Add glyph G to class NAME unless glyph_to_number() yields -1 for it.
+void font::add_class(const char *name, glyph *g)
+{
+  int num = glyph_to_number(g);
+  if (num != -1)
+    class_map[name].push_back(new single_char_class(num));
+}
+
+// Add the glyph range G1..G2, given in either order, to class NAME.
+void font::add_class(const char *name, glyph *g1, glyph *g2)
+{
+  int num1 = glyph_to_number(g1);
+  int num2 = glyph_to_number(g2);
+  if ((num1 == -1) || (num2 == -1))
+    return;
+  if (num1 > num2)	// a reversed range_char_class would match nothing
+    class_map[name].push_back(new range_char_class(num2, num1));
+  else
+    class_map[name].push_back(new range_char_class(num1, num2));
+}
+
+// Add the members of class ONAME to NAME; an unknown ONAME adds nothing.
+void font::add_class(const char *name, const char *oname)
+{
+  std::map<std::string, std::vector<char_class *> >::iterator it
+    = class_map.find(oname);
+  if (it == class_map.end() || strcmp(name, oname) == 0)
+    return;
+  std::vector<char_class *> &dest = class_map[name];
+  dest.insert(dest.end(), it->second.begin(), it->second.end());
+}
+
 // If the font can't be found, then if not_found is non-NULL, it will be set
 // to 1 otherwise a message will be printed.
 
@@ -1003,12 +1036,70 @@
 	      copy_entry(number_to_glyph(metric.code), last_glyph);
 	    }
 	  }
+
 	}
 	if (last_glyph == NULL) {
 	  t.error("I didn't seem to find any characters");
 	  return 0;
 	}
       }
+      else if (strcmp(command, "classes") == 0) {
+	// Each line of this section reads `<Name> = member ...', where a
+	// member is a glyph name, a range `first - last', or `<OtherClass>'.
+	if (head_only)
+	  return 1;
+	for (;;) {
+	  if (!t.next()) {
+	    command = 0;
+	    break;
+	  }
+	  char *cname = strtok(t.buf, WS);
+	  if (cname == 0)
+	    continue;
+	  char *equals = strtok(0, WS);
+	  if (equals == 0) {
+	    // A one-word line: store it in `command' and leave the section.
+	    command = cname;
+	    break;
+	  }
+	  p = strtok(0, WS);
+	  if (p == 0) {
+	    t.error("empty character classes not allowed");
+	    return 0;
+	  }
+	  // Glyph read but not yet stored; it may still begin a range.
+	  glyph *pending = 0;
+	  while (p != 0) {
+	    if ((pending != 0) && (strcmp(p, "-") == 0)) {
+	      p = strtok(0, WS);
+	      if (p == 0) {
+		t.error("incomplete range in class definition");
+		return 0;
+	      }
+	      // The end point is consumed by the range: it must neither be
+	      // stored a second time nor begin another range.
+	      add_class(cname, pending, name_to_glyph(p));
+	      pending = 0;
+	    }
+	    else {
+	      if (pending != 0) {
+		add_class(cname, pending);
+		pending = 0;
+	      }
+	      if ((p[0] == '<') && (p[strlen(p)-1] == '>')) {
+		add_class(cname, p);
+	      }
+	      else {
+		// Also reached for a `-' that cannot denote a range.
+		pending = name_to_glyph(p);
+	      }
+	    }
+	    p = strtok(0, WS);
+	  }
+	  if (pending != 0)
+	    add_class(cname, pending);
+	}
+      }
       else {
 	t.error("unrecognised command `%1' "
 		"after `kernpairs' or `charset' command",
diff -ur groff.old/src/libs/libgroff/Makefile.sub groff/src/libs/libgroff/Makefile.sub
--- groff.old/src/libs/libgroff/Makefile.sub	2007-11-19 15:43:33.000000000 +0000
+++ groff/src/libs/libgroff/Makefile.sub	2007-12-10 01:01:26.000000000 +0000
@@ -5,6 +5,7 @@
 OBJS=\
   assert.$(OBJEXT) \
   change_lf.$(OBJEXT) \
+  classes.$(OBJEXT) \
   cmap.$(OBJEXT) \
   color.$(OBJEXT) \
   cset.$(OBJEXT) \
@@ -55,6 +56,7 @@
 CCSRCS=\
   $(srcdir)/assert.cpp \
   $(srcdir)/change_lf.cpp \
+  $(srcdir)/classes.cpp \
   $(srcdir)/cmap.cpp \
   $(srcdir)/color.cpp \
   $(srcdir)/cset.cpp \
diff -ur groff.old/src/roff/troff/charinfo.h groff/src/roff/troff/charinfo.h
--- groff.old/src/roff/troff/charinfo.h	2007-11-19 15:43:33.000000000 +0000
+++ groff/src/roff/troff/charinfo.h	2007-11-20 21:50:09.000000000 +0000
@@ -20,6 +20,7 @@
 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
 
 class macro;
+class char_class;
 
 class charinfo : glyph {
   static int next_index;
@@ -91,7 +92,7 @@
 };
 
 charinfo *get_charinfo(symbol);
-extern charinfo *charset_table[];
+extern char_class *charset_table[];
 charinfo *get_charinfo_by_number(int);
 
 inline int charinfo::overlaps_horizontally()
diff -ur groff.old/src/roff/troff/env.cpp groff/src/roff/troff/env.cpp
--- groff.old/src/roff/troff/env.cpp	2007-11-19 15:43:33.000000000 +0000
+++ groff/src/roff/troff/env.cpp	2007-11-20 21:59:39.000000000 +0000
@@ -33,6 +33,7 @@
 #include "reg.h"
 #include "font.h"
 #include "charinfo.h"
+#include "classes.h"
 #include "macropath.h"
 #include "input.h"
 #include <math.h>
@@ -657,7 +658,7 @@
   current_tab(TAB_NONE),
   leader_node(0),
   tab_char(0),
-  leader_char(charset_table['.']),
+  leader_char(charset_table['.']->get_charinfo()),
   current_field(0),
   discarding(0),
   spread_flag(0),
@@ -1583,7 +1584,7 @@
   if (has_arg()) {
     node *nd = 0;
     for (int i = '9'; i >= '0'; i--) {
-      node *tem = make_node(charset_table[i], curenv);
+      node *tem = make_node(charset_table[i]->get_charinfo(), curenv);
       if (!tem) {
 	skip_line();
 	return;
diff -ur groff.old/src/roff/troff/input.cpp groff/src/roff/troff/input.cpp
--- groff.old/src/roff/troff/input.cpp	2007-11-19 15:43:33.000000000 +0000
+++ groff/src/roff/troff/input.cpp	2007-11-20 21:58:36.000000000 +0000
@@ -35,6 +35,7 @@
 #include "reg.h"
 #include "font.h"
 #include "charinfo.h"
+#include "classes.h"
 #include "macropath.h"
 #include "input.h"
 #include "defs.h"
@@ -89,7 +90,9 @@
 #ifndef POPEN_MISSING
 char *pipe_command = 0;
 #endif
-charinfo *charset_table[256];
+// The first 256 entries here are the first 256 characters; after that, they are
+// sorted only.
+char_class *charset_table[1024];
 unsigned char hpf_code_table[256];
 
 static int warning_mask = DEFAULT_WARNING_MASK;
@@ -2618,7 +2621,7 @@
 static int transparent_translate(int cc)
 {
   if (!invalid_input_char(cc)) {
-    charinfo *ci = charset_table[cc];
+    charinfo *ci = charset_table[cc]->get_charinfo();
     switch (ci->get_special_translation(1)) {
     case charinfo::TRANSLATE_SPACE:
       return ' ';
@@ -2804,7 +2807,7 @@
 		fprintf(stderr, "found [%c]\n", ch); fflush(stderr);
 	      }
 #endif
-	      curenv->add_char(charset_table[ch]);
+	      curenv->add_char(charset_table[ch]->get_charinfo());
 	      tok.next();
 	      if (tok.type != token::TOKEN_CHAR)
 		break;
@@ -6438,22 +6441,24 @@
 {
   char buf[16];
   strcpy(buf, "char");
+  memset(charset_table, 0, sizeof(charset_table));
   for (int i = 0; i < 256; i++) {
     strcpy(buf + 4, i_to_a(i));
-    charset_table[i] = get_charinfo(symbol(buf));
-    charset_table[i]->set_ascii_code(i);
-    if (csalpha(i))
-      charset_table[i]->set_hyphenation_code(cmlower(i));
-  }
-  charset_table['.']->set_flags(charinfo::ENDS_SENTENCE);
-  charset_table['?']->set_flags(charinfo::ENDS_SENTENCE);
-  charset_table['!']->set_flags(charinfo::ENDS_SENTENCE);
-  charset_table['-']->set_flags(charinfo::BREAK_AFTER);
-  charset_table['"']->set_flags(charinfo::TRANSPARENT);
-  charset_table['\'']->set_flags(charinfo::TRANSPARENT);
-  charset_table[')']->set_flags(charinfo::TRANSPARENT);
-  charset_table[']']->set_flags(charinfo::TRANSPARENT);
-  charset_table['*']->set_flags(charinfo::TRANSPARENT);
+    charset_table[i] = new single_char_class(i);
+    charset_table[i]->set_charinfo(get_charinfo(symbol(buf)));
+    charset_table[i]->get_charinfo()->set_ascii_code(i);
+  if (csalpha(i))
+      charset_table[i]->get_charinfo()->set_hyphenation_code(cmlower(i));
+  }
+  charset_table['.']->get_charinfo()->set_flags(charinfo::ENDS_SENTENCE);
+  charset_table['?']->get_charinfo()->set_flags(charinfo::ENDS_SENTENCE);
+  charset_table['!']->get_charinfo()->set_flags(charinfo::ENDS_SENTENCE);
+  charset_table['-']->get_charinfo()->set_flags(charinfo::BREAK_AFTER);
+  charset_table['"']->get_charinfo()->set_flags(charinfo::TRANSPARENT);
+  charset_table['\'']->get_charinfo()->set_flags(charinfo::TRANSPARENT);
+  charset_table[')']->get_charinfo()->set_flags(charinfo::TRANSPARENT);
+  charset_table[']']->get_charinfo()->set_flags(charinfo::TRANSPARENT);
+  charset_table['*']->get_charinfo()->set_flags(charinfo::TRANSPARENT);
   get_charinfo(symbol("dg"))->set_flags(charinfo::TRANSPARENT);
   get_charinfo(symbol("rq"))->set_flags(charinfo::TRANSPARENT);
   get_charinfo(symbol("em"))->set_flags(charinfo::BREAK_AFTER);
@@ -6464,7 +6469,7 @@
   get_charinfo(symbol("sqrtex"))->set_flags(charinfo::OVERLAPS_HORIZONTALLY);
   get_charinfo(symbol("ru"))->set_flags(charinfo::OVERLAPS_HORIZONTALLY);
   get_charinfo(symbol("br"))->set_flags(charinfo::OVERLAPS_VERTICALLY);
-  page_character = charset_table['%'];
+  page_character = charset_table['%']->get_charinfo();
 }
 
 static void init_hpf_code_table()
@@ -6611,14 +6616,14 @@
 charinfo *token::get_char(int required)
 {
   if (type == TOKEN_CHAR)
-    return charset_table[c];
+    return charset_table[c]->get_charinfo();
   if (type == TOKEN_SPECIAL)
     return get_charinfo(nm);
   if (type == TOKEN_NUMBERED_CHAR)
     return get_charinfo_by_number(val);
   if (type == TOKEN_ESCAPE) {
     if (escape_char != 0)
-      return charset_table[escape_char];
+      return charset_table[escape_char]->get_charinfo();
     else {
       error("`\\e' used while no current escape character");
       return 0;
@@ -6662,14 +6667,15 @@
   node *n = 0;
   switch (type) {
   case TOKEN_CHAR:
-    *pp = (*pp)->add_char(charset_table[c], curenv, &w, &s);
+    *pp = (*pp)->add_char(charset_table[c]->get_charinfo(), curenv, &w, &s);
     break;
   case TOKEN_DUMMY:
     n = new dummy_node;
     break;
   case TOKEN_ESCAPE:
     if (escape_char != 0)
-      *pp = (*pp)->add_char(charset_table[escape_char], curenv, &w, &s);
+      *pp = (*pp)->add_char(charset_table[escape_char]->get_charinfo(), curenv,
+	  &w, &s);
     break;
   case TOKEN_HYPHEN_INDICATOR:
     *pp = (*pp)->add_discretionary_hyphen();
@@ -6734,7 +6740,7 @@
 				      curenv->get_fill_color()));
     break;
   case TOKEN_CHAR:
-    curenv->add_char(charset_table[c]);
+    curenv->add_char(charset_table[c]->get_charinfo());
     break;
   case TOKEN_DUMMY:
     curenv->add_node(new dummy_node);
@@ -6747,7 +6753,7 @@
     break;
   case TOKEN_ESCAPE:
     if (escape_char != 0)
-      curenv->add_char(charset_table[escape_char]);
+      curenv->add_char(charset_table[escape_char]->get_charinfo());
     break;
   case TOKEN_BEGIN_TRAP:
   case TOKEN_END_TRAP:
@@ -8330,7 +8336,7 @@
 {
   charinfo *ci;
   if (nm[1] == 0)
-    ci = charset_table[nm[0] & 0xff];
+    ci = charset_table[nm[0] & 0xff]->get_charinfo();
   else if (nm[0] == '\\' && nm[2] == 0)
     ci = get_charinfo(symbol(nm + 1));
   else
diff -ur groff.old/src/roff/troff/node.cpp groff/src/roff/troff/node.cpp
--- groff.old/src/roff/troff/node.cpp	2007-11-19 15:43:33.000000000 +0000
+++ groff/src/roff/troff/node.cpp	2007-11-20 22:00:47.000000000 +0000
@@ -40,6 +40,7 @@
 #include "reg.h"
 #include "font.h"
 #include "charinfo.h"
+#include "classes.h"
 #include "input.h"
 #include "geometry.h"
 
@@ -6292,7 +6293,7 @@
 
 hunits env_digit_width(environment *env)
 {
-  node *n = make_glyph_node(charset_table['0'], env);
+  node *n = make_glyph_node(charset_table['0']->get_charinfo(), env);
   if (n) {
     hunits x = n->width();
     delete n;
--- groff.old/src/include/classes.h	1970-01-01 00:00:00.000000000 +0000
+++ groff/src/include/classes.h	2007-12-10 01:14:58.000000000 +0000
@@ -0,0 +1,82 @@
+/* This file is in the public domain. */
+
+#ifndef CLASSES_H
+#define CLASSES_H
+
+class charinfo;
+
+// One member of a character class: it covers one or more character
+// codes and may have a charinfo attached via set_charinfo().
+class char_class
+{
+  public:
+    char_class();
+    // Virtual so that members held as char_class * can be deleted safely.
+    virtual ~char_class();
+    // True if lookup_char(c) reports a match.
+    virtual bool is_in_class(int c);
+    // Three-way comparison: negative if c lies below this member,
+    // positive if above it, zero if c is covered.
+    virtual int lookup_char(int c) = 0;
+    // The attached charinfo, or 0 if none has been set.
+    charinfo *get_charinfo();
+    void set_charinfo(charinfo *);
+  private:
+    charinfo *ci;
+};
+
+// Member covering exactly one character code.
+class single_char_class : public char_class
+{
+  public:
+    explicit single_char_class(int c);
+    int lookup_char(int c);
+  private:
+    int ch;
+};
+
+// Member covering the inclusive range low..high.
+class range_char_class : public char_class
+{
+  public:
+    range_char_class(int low, int high);
+    int lookup_char(int c);
+  private:
+    int lo, hi;
+};
+
+// Member that forwards lookups to another char_class.
+class ref_char_class : public char_class
+{
+  public:
+    explicit ref_char_class(char_class *klass);
+    int lookup_char(int c);
+    char_class *get_class();
+  private:
+    char_class *ref;
+};
+
+inline char_class::char_class() : ci(0)
+{
+}
+
+inline char_class::~char_class()
+{
+}
+
+inline bool char_class::is_in_class(int c)
+{
+  return lookup_char(c) == 0;
+}
+
+inline charinfo *char_class::get_charinfo()
+{
+  return ci;
+}
+
+inline void char_class::set_charinfo(charinfo *cis)
+{
+  ci = cis;
+}
+
+#endif /* CLASSES_H */
--- groff.old/src/libs/libgroff/classes.cpp	1970-01-01 00:00:00.000000000 +0000
+++ groff/src/libs/libgroff/classes.cpp	2007-12-10 01:14:43.000000000 +0000
@@ -0,0 +1,50 @@
+/* This file is in the public domain. */
+
+#include "classes.h"
+
+// Remember the single character code this member covers.
+single_char_class::single_char_class(int c)
+  : ch(c)
+{
+}
+
+// Compare c with the stored code: -1 if below, 1 if above, 0 if equal.
+int single_char_class::lookup_char(int c)
+{
+  if (c == ch)
+    return 0;
+  return (c < ch) ? -1 : 1;
+}
+
+// Remember the bounds low and high.
+range_char_class::range_char_class(int low, int high)
+  : lo(low), hi(high)
+{
+}
+
+// Compare c with the stored bounds: -1 if below lo, 1 if above hi,
+// 0 otherwise.
+int range_char_class::lookup_char(int c)
+{
+  if (c < lo)
+    return -1;
+  return (c > hi) ? 1 : 0;
+}
+
+// Remember the class that lookups are forwarded to.
+ref_char_class::ref_char_class(char_class *klass)
+  : ref(klass)
+{
+}
+
+// Forward the comparison to the referenced class.
+int ref_char_class::lookup_char(int c)
+{
+  return ref->lookup_char(c);
+}
+
+// Return the referenced class.
+char_class *ref_char_class::get_class()
+{
+  return ref;
+}

Attachment: signature.asc
Description: Digital signature

Reply via email to