I submitted these changes to the libunistring mailing list, but was advised to send them here, as libunistring appears to be a collection of Gnulib modules. So here I am, repeating what I sent to the libunistring list. (The patch below is relative to the latest Gnulib master.)
Libunistring on MS-Windows is currently less useful than it could be. In a nutshell, it supports only the default system locale; setting the locale to anything else using 'setlocale' will cause many libunistring functions to fail. One reason is that libunistring needs to know the codeset of the locale, to use it when it converts strings to and from Unicode using iconv. However, on Windows the function locale_charset always returns the default ANSI codepage, using the GetACP API. That API always returns the same codepage regardless of locale changes. The result is that when libunistring is passed a string in any encoding incompatible with the default system codepage, any calls to libiconv will fail with EILSEQ. And since most libunistring functions call libiconv, they all fail. The second problem that gets in the way is the one in gl_locale_name. On MS-Windows, this function returns only the default locale (by calling GetThreadLocale, which is again oblivious to any locale changes made by 'setlocale'). So again, only the default locale can be supported correctly. E.g., functions that need to apply language-specific rules, such as those for Turkish, will work incorrectly because the language they get from gl_locale_name is not Turkish, even though 'setlocale' was previously called to set the Turkish locale. To fix these problems, I propose the following changes. They have been extensively tested using Guile's test suite, in particular the i18n.test there. I hope these changes will be accepted, because as I said, without them libunistring is much less useful on MS-Windows than it could be. Thanks. 2014-07-02 Eli Zaretskii <e...@gnu.org> Improve support for non-default locales on MS-Windows. * lib/localcharset.c (locale_charset) [WINDOWS_NATIVE]: Before falling back on the default system codepage, try extracting the codepage from what 'setlocale' returns. This allows to take into account changes of the codeset due to non-default locale set by a previous call to 'setlocale'. 
* lib/localename.c (LOCALE_NAME_MAX_LENGTH) [WINDOWS_NATIVE]: Define if not already defined. (enum_locales_fn, get_lcid) [WINDOWS_NATIVE]: New functions. (gl_locale_name_thread) [WINDOWS_NATIVE]: Produce the current locale by calling 'setlocale', then converting the locale name into LCID by calling 'get_lcid'. This allows to take into account changes in the current locale from the default one, in contrast to GetThreadLocale. --- lib/localcharset.c~0 2014-02-09 09:41:26 +0200 +++ lib/localcharset.c 2014-07-03 18:17:44 +0300 @@ -34,6 +34,7 @@ #if defined _WIN32 || defined __WIN32__ # define WINDOWS_NATIVE +# include <locale.h> #endif #if defined __EMX__ @@ -461,14 +462,34 @@ locale_charset (void) static char buf[2 + 10 + 1]; - /* The Windows API has a function returning the locale's codepage as a - number: GetACP(). - When the output goes to a console window, it needs to be provided in - GetOEMCP() encoding if the console is using a raster font, or in - GetConsoleOutputCP() encoding if it is using a TrueType font. - But in GUI programs and for output sent to files and pipes, GetACP() - encoding is the best bet. */ - sprintf (buf, "CP%u", GetACP ()); + /* The Windows API has a function returning the locale's codepage as + a number, but the value doesn't change according to what the + 'setlocale' call specified. So we use it as a last resort, in + case the string returned by 'setlocale' doesn't specify the + codepage. */ + char *current_locale = setlocale (LC_ALL, NULL); + char *pdot; + + /* If they set different locales for different categories, + 'setlocale' will return a semi-colon separated list of locale + values. To make sure we use the correct one, we choose LC_CTYPE. */ + if (strchr (current_locale, ';')) + current_locale = setlocale (LC_CTYPE, NULL); + + pdot = strrchr (current_locale, '.'); + if (pdot) + sprintf (buf, "CP%s", pdot + 1); + else + { + /* The Windows API has a function returning the locale's codepage as a + number: GetACP(). 
+ When the output goes to a console window, it needs to be provided in + GetOEMCP() encoding if the console is using a raster font, or in + GetConsoleOutputCP() encoding if it is using a TrueType font. + But in GUI programs and for output sent to files and pipes, GetACP() + encoding is the best bet. */ + sprintf (buf, "CP%u", GetACP ()); + } codeset = buf; #elif defined OS2 --- lib/localename.c~0 2014-02-09 09:41:26 +0200 +++ lib/localename.c 2014-07-03 18:11:45 +0300 @@ -60,6 +60,7 @@ #if defined WINDOWS_NATIVE || defined __CYGWIN__ /* Native Windows or Cygwin */ # define WIN32_LEAN_AND_MEAN # include <windows.h> +# include <winnls.h> /* List of language codes, sorted by value: 0x01 LANG_ARABIC 0x02 LANG_BULGARIAN @@ -1124,6 +1125,9 @@ # ifndef LOCALE_SNAME # define LOCALE_SNAME 0x5c # endif +# ifndef LOCALE_NAME_MAX_LENGTH +# define LOCALE_NAME_MAX_LENGTH 85 +# endif #endif @@ -2502,7 +2506,69 @@ gl_locale_name_from_win32_LCID (LCID lci return gl_locale_name_from_win32_LANGID (langid); } -#endif +#ifdef WINDOWS_NATIVE + +/* Two variables to interface between get_lcid and the EnumLocales + callback function below. */ +static LCID found_lcid; +static char lname[LC_MAX * (LOCALE_NAME_MAX_LENGTH + 1) + 1]; + +/* Callback function for EnumLocales. */ +static BOOL CALLBACK +enum_locales_fn (LPTSTR locale_num_str) +{ + char *endp; + char locval[2 * LOCALE_NAME_MAX_LENGTH + 1 + 1]; + LCID try_lcid = strtoul (locale_num_str, &endp, 16); + + if (GetLocaleInfo (try_lcid, LOCALE_SENGLANGUAGE, + locval, LOCALE_NAME_MAX_LENGTH)) + { + strcat (locval, "_"); + if (GetLocaleInfo (try_lcid, LOCALE_SENGCOUNTRY, + locval + strlen (locval), LOCALE_NAME_MAX_LENGTH)) + { + size_t locval_len = strlen (locval); + + if (strncmp (locval, lname, locval_len) == 0 + && (lname[locval_len] == '.' + || lname[locval_len] == '\0')) + { + found_lcid = try_lcid; + return FALSE; + } + } + } + return TRUE; +} + +/* Return the Locale ID (LCID) number given the locale's name, a + string, in LOCALE_NAME. 
This works by enumerating all the locales + supported by the system, until we find one whose name matches + LOCALE_NAME. */ +static LCID +get_lcid (const char *locale_name) +{ + /* A simple cache. */ + static LCID last_lcid; + static char last_locale[1000]; + + if (last_lcid > 0 && strcmp (locale_name, last_locale) == 0) + return last_lcid; + strncpy (lname, locale_name, sizeof (lname) - 1); + lname[sizeof (lname) - 1] = '\0'; + found_lcid = 0; + EnumSystemLocales (enum_locales_fn, LCID_SUPPORTED); + if (found_lcid > 0) + { + last_lcid = found_lcid; + strcpy (last_locale, locale_name); + } + return found_lcid; +} + +#endif /* WINDOWS_NATIVE */ +#endif /* WINDOWS_NATIVE || __CYGWIN__ */ #if HAVE_USELOCALE /* glibc or Mac OS X */ @@ -2660,6 +2726,26 @@ gl_locale_name_thread (int category, con const char *name = gl_locale_name_thread_unsafe (category, categoryname); if (name != NULL) return struniq (name); +#elif defined WINDOWS_NATIVE + if (LC_MIN <= category && category <= LC_MAX) + { + char *locname = setlocale (category, NULL); + + /* If CATEGORY is LC_ALL, the result might be a semi-colon + separated list of locales. We need only one, so we take the + one corresponding to LC_CTYPE, as the most important for + character translations. */ + if (strchr (locname, ';')) + locname = setlocale (LC_CTYPE, NULL); + + /* Convert locale name to LCID. We don't want to use + LocaleNameToLCID because (a) it is only available since Vista, + and (b) it doesn't accept locale names returned by 'setlocale'. */ + LCID lcid = get_lcid (locname); + + if (lcid > 0) + return gl_locale_name_from_win32_LCID (lcid); + } #endif return NULL; }