Thanks for explaining. I still see a problem with the proposed patch, though, in that (if I'm understanding it correctly) it would cause c_isalpha (120) to succeed, even though EBCDIC 120 corresponds to U+00CC LATIN CAPITAL LETTER I WITH GRAVE, and that is not supposed to be an alphabetic character in the stripped-down C locale. Code that uses c-ctype wants only ASCII letters, and departing from this would likely break things.

Worse, the C expression "c_ispunct ('[')" might return false, as the library may be in a locale that's incompatible with the mode the compiler was in when it compiled the '['.

Looking at the web page you mentioned, it appears that one approach is to assume EBCDIC 1047 (this seems to be the default and typical setting for C programs) at both compile-time and run-time. We can check the compile-time assumption without any code overhead. The proposed patch does that. If someone really wants to use a different code page, either at compile-time or at run-time, more code will need to be written (most likely by the poor soul who actually needs that feature).

Yes, all control characters appear to be in [\x00-\x3F], but not everything in that range is a control character. (I remember 0x04 was not.) I tried making c_iscntrl() a simple range check at first, but that did not agree with the system iscntrl().

Thanks, this should be fixed in the attached patch, which I've installed.
From a92ab221b5cad8a5c1a5ca1fc1823d1f3fe4a24b Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Tue, 22 Sep 2015 14:47:06 -0700
Subject: [PATCH] c-ctype: assume EBCDIC 1047 for c_iscntrl

* lib/c-ctype.c (c_iscntrl): When EBCDIC, assume code page 1047 at
both compile-time and at run-time.  Check it at compile-time.  We can
worry about other code pages later, if the topic ever comes up.
Fix typo in C_CTYPE_EBCDIC.
---
 lib/c-ctype.c | 38 +++++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/lib/c-ctype.c b/lib/c-ctype.c
index 916d46e..558c4af 100644
--- a/lib/c-ctype.c
+++ b/lib/c-ctype.c
@@ -131,17 +131,37 @@ c_isblank (int c)
 bool
 c_iscntrl (int c)
 {
-  enum { C_CTYPE_EBCDIC = (' ' == 64 && '0' == 240
-                           && 'A' == 193 && 'J' == 209 && 'S' == 226
-                           && 'A' == 129 && 'J' == 145 && 'S' == 162) };
-  verify (C_CTYPE_ASCII || C_CTYPE_EBCDIC);
-
-  if (0 <= c && c < ' ')
-    return true;
+  enum { C_CTYPE_EBCDIC = (' ' == '\x40' && '0' == '\xf0'
+                           && 'A' == '\xc1' && 'J' == '\xd1' && 'S' == '\xe2'
+                           && 'a' == '\x81' && 'j' == '\x91' && 's' == '\xa2') };
   if (C_CTYPE_ASCII)
-    return c == 0x7f;
+    return (0 <= c && c < ' ') || c == 0x7f;
   else
-    return c == 0xff || c == -1;
+    {
+      /* Return true if C corresponds to an ASCII control character.
+         Assume EBCDIC code page 1047, and verify that the compiler
+         agrees with this.  */
+      verify (C_CTYPE_ASCII
+              || (C_CTYPE_EBCDIC
+                  && '!' == '\x5a' && '#' == '\x7b' && '$' == '\x5b'
+                  && '@' == '\x7c' && '[' == '\xad' && '\\' == '\xe0'
+                  && ']' == '\xbd' && '^' == '\x5f' && '_' == '\x6d'
+                  && '`' == '\x79'));
+      switch (c)
+        {
+        case '\x00': case '\x01': case '\x02': case '\x03': case '\x05':
+        case '\x0b': case '\x0c': case '\x0d': case '\x0e': case '\x0f':
+        case '\x10': case '\x11': case '\x12': case '\x13': case '\x15':
+        case '\x16': case '\x18': case '\x19': case '\x1c': case '\x1d':
+        case '\x1e': case '\x1f': case '\x26': case '\x27': case '\x2d':
+        case '\x2e': case '\x2f': case '\x32': case '\x37': case '\x3c':
+        case '\x3d': case '\x3f': case '\xff':
+        case '\xff' < 0 ? 0xff : -1:
+          return true;
+        default:
+          return false;
+        }
+    }
 }
 
 bool
-- 
2.1.0

Reply via email to