KO Myung-Hun <kom...@gmail.com> writes: > Thanks. ^^ And fixed typo, cp1361 to cp1381.
Oops, thanks for fixing. >> By the way, it's tempting to call DosQueryCp if a charset is omitted >> from the locale name, to avoid maintaining the default mapping >> ourselves. I'd rather not do that for now, but is it feasible? > > This was my first idea. But Bruno rejected. See > > http://lists.gnu.org/archive/html/bug-gnu-libiconv/2011-03/msg00000.html Thanks for the link. If I understand correctly, the main point seems to be that the patch affects the codeset of the POSIX locale ("C" or "POSIX"). For other locale values, POSIX says: If the locale value has the form: language[_territory][.codeset] it refers to an implementation-provided locale, where settings of language, territory, and codeset are implementation-defined. So, I don't see any problem with using the system's codepage as the default codeset of a language_territory locale (though it might conflict with the libiconv design). FWIW, kLIBC's nl_langinfo/setlocale implementation does this: - if a codeset is specified as part of the locale name, use it - if the locale name is "C" or "POSIX", use "US-ASCII" - otherwise, fall back to DosQueryCp I'm attaching two patches: the first one is an update of the patch we are currently working on (I added mappings from the "unusable" codepages to their equivalents), and the second one is an alternative implementation following the kLIBC implementation. Regards, -- Daiki Ueno
>From 809f9fa29af26293e811b7c27116e6ff65fba526 Mon Sep 17 00:00:00 2001 From: KO Myung-Hun <k...@chollian.net> Date: Thu, 23 Feb 2012 22:37:21 +0900 Subject: [PATCH] localcharset: embed charset aliases for OS/2 Embed the contents of charset.alias file to avoid the troubles finding a separate file, like Windows. * lib/config.charset: Remove os2* from case "$os" in * lib/localcharset.c (get_charset_aliases) [OS2]: Use embedded encoding aliases on OS/2. --- lib/config.charset | 4 +-- lib/localcharset.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/lib/config.charset b/lib/config.charset index 4e4c7ed..3e6c88f 100644 --- a/lib/config.charset +++ b/lib/config.charset @@ -348,12 +348,10 @@ case "$os" in #echo "sun_eu_greek ?" # what is this? echo "UTF-8 UTF-8" ;; - freebsd* | os2*) + freebsd*) # FreeBSD 4.2 doesn't have nl_langinfo(CODESET); therefore # localcharset.c falls back to using the full locale name # from the environment variables. - # Likewise for OS/2. OS/2 has XFree86 just like FreeBSD. Just - # reuse FreeBSD's locale data for OS/2. echo "C ASCII" echo "US-ASCII ASCII" for l in la_LN lt_LN; do diff --git a/lib/localcharset.c b/lib/localcharset.c index 1c17af0..9c465bc 100644 --- a/lib/localcharset.c +++ b/lib/localcharset.c @@ -128,7 +128,7 @@ get_charset_aliases (void) cp = charset_aliases; if (cp == NULL) { -#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__) +#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__ || defined OS2) const char *dir; const char *base = "charset.alias"; char *file_name; @@ -342,6 +342,77 @@ get_charset_aliases (void) "CP54936" "\0" "GB18030" "\0" "CP65001" "\0" "UTF-8" "\0"; # endif +# if defined OS2 + /* To avoid the troubles of installing a separate file in the same + directory as the DLL and of retrieving the DLL's directory at + runtime, simply inline the aliases here. 
*/ + + /* On OS/2, a charset is usually omitted from the locale name + (for example, users set LANG just to ko_KR for Korean), thus + we define the default mapping from a locale name to charset. + The list of encodings is taken from the OS/2 documentation + CMDREF.INF (CODEPAGE and COUNTRY entries), see also + <http://www.borgendale.com/locale.htm>. + Since OS/2 codepages CP915, CP921, and CP1381 are not defined + in libiconv, map them to their equivalents. */ + /* Pri, Alt, Etc */ + cp = "ar_AA" "\0" "CP864" "\0" /* 864, 850, 437 */ + "bg_BG" "\0" "ISO-8859-5" "\0" /* 915, 850, 855 */ + "ca_ES" "\0" "CP850" "\0" /* */ + "cs_SZ" "\0" "CP852" "\0" /* 852, 850 */ + "da_DK" "\0" "CP850" "\0" /* 850, 865, 1004 */ + "de_AT" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "de_CH" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "de_DE" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "el_GR" "\0" "CP869" "\0" /* 869, 850, 813 */ + "en_AU" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "en_CA" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "en_GB" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "en_IE" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "en_NZ" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "en_US" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "en_ZA" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "es_ES" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "es_LA" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "et_EE" "\0" "CP922" "\0" /* 922, 850 */ + "fi_FI" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "fr_BE" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "fr_CA" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "fr_CH" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "fr_FR" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "hr_HR" "\0" "CP852" "\0" /* 852, 850 */ + "hu_HU" "\0" "CP852" "\0" /* 852, 850, 1004 */ + "is_IS" "\0" "CP850" "\0" /* 850, 861, 1004 */ + "it_CH" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "it_IT" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "iw_IL" "\0" "CP862" "\0" /* 862, 850, 437 */ + "ja_JP" "\0" "CP943" "\0" /* 943, 850, 942 */ + "ko_KR" "\0" 
"CP949" "\0" /* 949, 850, 944 */ + "lt_LT" "\0" "ISO-8859-13" "\0" /* 921, 850 */ + "lv_LV" "\0" "ISO-8859-13" "\0" /* 921, 850 */ + "mk_MK" "\0" "CP855" "\0" /* 855, 850, 915 */ + "nl_BE" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "nl_NL" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "no_NO" "\0" "CP850" "\0" /* 850, 865, 1004 */ + "pl_PL" "\0" "CP852" "\0" /* 852, 850 */ + "pt_BR" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "pt_PT" "\0" "CP850" "\0" /* 850, 860, 1004 */ + "ro_RO" "\0" "CP852" "\0" /* 852, 850, 1004 */ + "ru_RU" "\0" "CP866" "\0" /* 866, 850, 915 */ + "sh_BA" "\0" "CP852" "\0" /* 852, 850 */ + "sk_SK" "\0" "CP852" "\0" /* 852, 850 */ + "sl_SI" "\0" "CP852" "\0" /* 852, 850 */ + "sq_AL" "\0" "CP850" "\0" /* 850, 437 */ + "sr_SP" "\0" "CP855" "\0" /* 855, 850, 915 */ + "sv_SE" "\0" "CP850" "\0" /* 850, 437, 1004 */ + "th_TH" "\0" "CP874" "\0" /* 874, 850 */ + "tr_TR" "\0" "CP857" "\0" /* 857, 850, 1004 */ + "zh_CN" "\0" "GB2312" "\0" /* 1381, 850, 946 */ + "zh_TW" "\0" "CP950" "\0" /* 950, 850, 948 */ + "CP915" "\0" "ISO-8859-5" "\0" + "CP921" "\0" "ISO-8859-13" "\0" + "CP1381" "\0" "GB2312" "\0"; +# endif #endif charset_aliases = cp; @@ -530,7 +601,7 @@ locale_charset (void) } } - /* Resolve through the charset.alias file. */ + /* Resolve through aliases. */ codeset = locale; } else -- 2.1.0
>From 68a1c52bcf0717a3823b3365702e68e676b60163 Mon Sep 17 00:00:00 2001 From: KO Myung-Hun <k...@chollian.net> Date: Thu, 23 Feb 2012 22:37:21 +0900 Subject: [PATCH] localcharset: improve charset detection for OS/2 Use system's codepage if possible. Embed the contents of charset.alias file to avoid the troubles finding a separate file, like Windows. * lib/config.charset: Remove os2* from case "$os" in * lib/localcharset.c (get_charset_aliases) [OS2]: Use embedded encoding aliases on OS/2. (locale_charset) [OS2]: Use system's codepage if the locale name is not "C" nor "POSIX". Resolve unusable codepages through aliases. --- lib/config.charset | 4 +--- lib/localcharset.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/lib/config.charset b/lib/config.charset index 4e4c7ed..3e6c88f 100644 --- a/lib/config.charset +++ b/lib/config.charset @@ -348,12 +348,10 @@ case "$os" in #echo "sun_eu_greek ?" # what is this? echo "UTF-8 UTF-8" ;; - freebsd* | os2*) + freebsd*) # FreeBSD 4.2 doesn't have nl_langinfo(CODESET); therefore # localcharset.c falls back to using the full locale name # from the environment variables. - # Likewise for OS/2. OS/2 has XFree86 just like FreeBSD. Just - # reuse FreeBSD's locale data for OS/2. 
echo "C ASCII" echo "US-ASCII ASCII" for l in la_LN lt_LN; do diff --git a/lib/localcharset.c b/lib/localcharset.c index 1c17af0..286a780 100644 --- a/lib/localcharset.c +++ b/lib/localcharset.c @@ -128,7 +128,7 @@ get_charset_aliases (void) cp = charset_aliases; if (cp == NULL) { -#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__) +#if !(defined DARWIN7 || defined VMS || defined WINDOWS_NATIVE || defined __CYGWIN__ || defined OS2) const char *dir; const char *base = "charset.alias"; char *file_name; @@ -342,6 +342,15 @@ get_charset_aliases (void) "CP54936" "\0" "GB18030" "\0" "CP65001" "\0" "UTF-8" "\0"; # endif +# if defined OS2 + /* To avoid the troubles of installing a separate file in the same + directory as the DLL and of retrieving the DLL's directory at + runtime, simply inline the aliases here. */ + + cp = "CP915" "\0" "ISO-8859-5" "\0" + "CP921" "\0" "ISO-8859-13" "\0" + "CP1381" "\0" "GB2312" "\0"; +# endif #endif charset_aliases = cp; @@ -362,7 +371,7 @@ STATIC const char * locale_charset (void) { - const char *codeset; + const char *codeset = NULL; const char *aliases; #if !(defined WINDOWS_NATIVE || defined OS2) @@ -530,10 +539,12 @@ locale_charset (void) } } - /* Resolve through the charset.alias file. */ - codeset = locale; + /* For the POSIX locale, don't use the system's codepage. */ + if (strcmp (locale, "C") == 0 || strcmp (locale, "POSIX") == 0) + codeset = ""; } - else + + if (codeset == NULL) { /* OS/2 has a function returning the locale's codepage as a number. */ if (DosQueryCp (sizeof (cp), cp, &cplen)) -- 2.1.0