I've received several requests to add new locale names,
both on this list and off-list, from several people.

I've been trying to find a way to keep /usr/share/locale reasonably
clean while also allowing people to use their preferred locale names.

Currently, the list of supported locale names is represented by the
list of directories in /usr/share/locale. I don't think we should
continue to maintain a list of <language>_<country>.<encoding> names
because such a list cannot be maintained properly.

Some requests that have been made are non-functional changes.
E.g. adding a <country> doesn't have a functional effect on OpenBSD.
Still, some users would like to use names containing
<theirlanguage>_<theircountry>, for whatever reason.

There have also been requests for supporting locale names such
as "C.UTF-8". I'm not sure what the use case is, but as a side effect
of the proposal below, such names would also be possible.

POSIX doesn't specify how files in /usr/share/locale are stored.
bluhm@ suggested changing the filesystem layout such that encoding
and language are separated. libc will look up locale definition data at
specific places depending on which of the LC_* categories is being set.

LC_CTYPE support code needs to look at the character encoding only.
It only cares about the encoding part of the locale name, which by
convention is the substring after the last dot in the locale name.

The suggested new layout looks like this:

  /usr/share/locale/UTF-8/LC_CTYPE
  /usr/share/locale/CP1251/LC_CTYPE
  /usr/share/locale/ISO8859-1/LC_CTYPE
  /usr/share/locale/ISO8859-15/LC_CTYPE
  /usr/share/locale/ISO8859-2/LC_CTYPE
  /usr/share/locale/ISO8859-7/LC_CTYPE
  /usr/share/locale/ARMSCII-8/LC_CTYPE
  /usr/share/locale/ISO8859-4/LC_CTYPE
  /usr/share/locale/ISO8859-13/LC_CTYPE
  /usr/share/locale/CP866/LC_CTYPE
  /usr/share/locale/KOI8-R/LC_CTYPE
  /usr/share/locale/ISO8859-5/LC_CTYPE
  /usr/share/locale/KOI8-U/LC_CTYPE

All other files and directories currently in /usr/share/locale
can be removed.

If we later add support for language- or country-specific features
such as LC_COLLATE we can add directories for every language the
collation code supports:

  /usr/share/locale/en/LC_COLLATE
  /usr/share/locale/es/LC_COLLATE
  /usr/share/locale/de/LC_COLLATE

Or even add country names, if necessary and supported by the
hypothetical collation code:
  /usr/share/locale/it_IT/LC_COLLATE
  /usr/share/locale/it_CH/LC_COLLATE

Does anyone see problems with this plan?

Index: share/locale/ctype/Makefile
===================================================================
RCS file: /cvs/src/share/locale/ctype/Makefile,v
retrieving revision 1.6
diff -u -p -r1.6 Makefile
--- share/locale/ctype/Makefile 16 Jul 2011 21:33:30 -0000      1.6
+++ share/locale/ctype/Makefile 30 May 2013 19:16:33 -0000
@@ -5,295 +5,81 @@ NOMAN=       # defined
 # pull LOCALEDIR and other declarations
 .include <bsd.own.mk>
 
-LOCALES += ar_SD.UTF-8
- LOCALESRC_ar_SD.UTF-8 = en_US.UTF-8
+LOCALES += UTF-8
+ LOCALESRC_UTF-8 = en_US.UTF-8
 
-LOCALES += ar_SY.UTF-8
- LOCALESRC_ar_SY.UTF-8 = en_US.UTF-8
+LOCALES += CP1251
+ LOCALESRC_CP1251 = bg_BG.CP1251
 
-LOCALES += bg_BG.CP1251
- LOCALESRC_bg_BG.CP1251 = bg_BG.CP1251
+LOCALES += ISO8859-1
+ LOCALESRC_ISO8859-1 = en_US.ISO_8859-1
 
-LOCALES += ca_ES.ISO8859-1
- LOCALESRC_ca_ES.ISO8859-1 = en_US.ISO_8859-1
+LOCALES += ISO8859-15
+ LOCALESRC_ISO8859-15 = en_US.DIS_8859-15
 
-LOCALES += ca_ES.ISO8859-15
- LOCALESRC_ca_ES.ISO8859-15 = en_US.DIS_8859-15
+LOCALES += ISO8859-2
+ LOCALESRC_ISO8859-2 = en_US.ISO_8859-2
 
-LOCALES += cs_CZ.ISO8859-2
- LOCALESRC_cs_CZ.ISO8859-2 = en_US.ISO_8859-2
+LOCALES += ISO8859-7
+ LOCALESRC_ISO8859-7 = el_GR.ISO8859-7
 
-LOCALES += da_DK.ISO8859-1
- LOCALESRC_da_DK.ISO8859-1 = en_US.ISO_8859-1
+LOCALES += ARMSCII-8
+ LOCALESRC_ARMSCII-8 = hy_AM.ARMSCII-8
 
-LOCALES += da_DK.ISO8859-15
- LOCALESRC_da_DK.ISO8859-15 = en_US.DIS_8859-15
+#LOCALES += ct
+# LOCALESRC_ct = ja_JP.CTEXT
 
-LOCALES += de_AT.ISO8859-1
- LOCALESRC_de_AT.ISO8859-1 = en_US.ISO_8859-1
+#LOCALES += eucJP
+# LOCALESRC_eucJP = ja_JP.eucJP
 
-LOCALES += de_AT.ISO8859-15
- LOCALESRC_de_AT.ISO8859-15 = en_US.DIS_8859-15
+#LOCALES += ISO2022-JP
+# LOCALESRC_ISO2022-JP = ja_JP.ISO-2022-JP
 
-LOCALES += de_CH.ISO8859-1
- LOCALESRC_de_CH.ISO8859-1 = en_US.ISO_8859-1
+#LOCALES += ISO2022-JP2
+# LOCALESRC_ISO2022-JP2 = ja_JP.ISO-2022-JP-2
 
-LOCALES += de_CH.ISO8859-15
- LOCALESRC_de_CH.ISO8859-15 = en_US.DIS_8859-15
+#LOCALES += SJIS
+# LOCALESRC_SJIS = ja_JP.SJIS
 
-LOCALES += de_DE.ISO8859-1
- LOCALESRC_de_DE.ISO8859-1 = en_US.ISO_8859-1
+#LOCALES += eucKR
+# LOCALESRC_eucKR = ko_KR.eucKR
 
-LOCALES += de_DE.ISO8859-15
- LOCALESRC_de_DE.ISO8859-15 = en_US.DIS_8859-15
+LOCALES += ISO8859-4
+ LOCALESRC_ISO8859-4 = en_US.ISO_8859-4
 
-LOCALES += de_DE.UTF-8
- LOCALESRC_de_DE.UTF-8 = en_US.UTF-8
+LOCALES += ISO8859-13
+ LOCALESRC_ISO8859-13 = lt_LT.ISO8859-13
 
-LOCALES += el_GR.ISO8859-7
- LOCALESRC_el_GR.ISO8859-7 = el_GR.ISO8859-7
+LOCALES += CP866
+ LOCALESRC_CP866 = ru_RU.CP866
 
-LOCALES += en_AU.ISO8859-1
- LOCALESRC_en_AU.ISO8859-1 = en_US.ISO_8859-1
+LOCALES += KOI8-R
+ LOCALESRC_KOI8-R = ru_RU.KOI8-R
 
-LOCALES += en_AU.ISO8859-15
- LOCALESRC_en_AU.ISO8859-15 = en_US.DIS_8859-15
+LOCALES += ISO8859-5
+ LOCALESRC_ISO8859-5 = ru_RU.ISO_8859-5
 
-LOCALES += en_CA.ISO8859-1
- LOCALESRC_en_CA.ISO8859-1 = en_US.ISO_8859-1
+LOCALES += KOI8-U
+ LOCALESRC_KOI8-U = uk_UA.KOI8-U
 
-LOCALES += en_CA.ISO8859-15
- LOCALESRC_en_CA.ISO8859-15 = en_US.DIS_8859-15
+#LOCALES += eucCN
+# LOCALESRC_eucCN = zh_CN.eucCN
 
-LOCALES += en_GB.ISO8859-1
- LOCALESRC_en_GB.ISO8859-1 = en_US.ISO_8859-1
+#LOCALES += GB18030
+# LOCALESRC_GB18030 = zh_CN.GB18030
 
-LOCALES += en_GB.ISO8859-15
- LOCALESRC_en_GB.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += en_US.ISO8859-1
- LOCALESRC_en_US.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += en_US.ISO8859-15
- LOCALESRC_en_US.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += en_US.UTF-8
- LOCALESRC_en_US.UTF-8 = en_US.UTF-8
-
-LOCALES += es_ES.ISO8859-1
- LOCALESRC_es_ES.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += es_ES.ISO8859-15
- LOCALESRC_es_ES.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += es_ES.UTF-8
- LOCALESRC_es_ES.UTF-8 = en_US.UTF-8
-
-LOCALES += fa_IR.UTF-8
- LOCALESRC_fa_IR.UTF-8 = en_US.UTF-8
-
-LOCALES += fi_FI.ISO8859-1
- LOCALESRC_fi_FI.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += fi_FI.ISO8859-15
- LOCALESRC_fi_FI.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += fr_BE.ISO8859-1
- LOCALESRC_fr_BE.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += fr_BE.ISO8859-15
- LOCALESRC_fr_BE.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += fr_BE.UTF-8
- LOCALESRC_fr_BE.UTF-8 = en_US.UTF-8
-
-LOCALES += fr_CA.ISO8859-1
- LOCALESRC_fr_CA.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += fr_CA.ISO8859-15
- LOCALESRC_fr_CA.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += fr_CA.UTF-8
- LOCALESRC_fr_CA.UTF-8 = en_US.UTF-8
-
-LOCALES += fr_CH.ISO8859-1
- LOCALESRC_fr_CH.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += fr_CH.ISO8859-15
- LOCALESRC_fr_CH.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += fr_CH.UTF-8
- LOCALESRC_fr_CH.UTF-8 = en_US.UTF-8
-
-LOCALES += fr_FR.ISO8859-1
- LOCALESRC_fr_FR.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += fr_FR.ISO8859-15
- LOCALESRC_fr_FR.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += fr_FR.UTF-8
- LOCALESRC_fr_FR.UTF-8 = en_US.UTF-8
-
-LOCALES += hr_HR.ISO8859-2
- LOCALESRC_hr_HR.ISO8859-2 = en_US.ISO_8859-2
-
-LOCALES += hu_HU.ISO8859-2
- LOCALESRC_hu_HU.ISO8859-2 = en_US.ISO_8859-2
-
-LOCALES += hu_HU.UTF-8
- LOCALESRC_hu_HU.UTF-8 = en_US.UTF-8
-
-LOCALES += hy_AM.ARMSCII-8
- LOCALESRC_hy_AM.ARMSCII-8 = hy_AM.ARMSCII-8
-
-LOCALES += is_IS.ISO8859-1
- LOCALESRC_is_IS.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += is_IS.ISO8859-15
- LOCALESRC_is_IS.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += it_CH.ISO8859-1
- LOCALESRC_it_CH.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += it_CH.ISO8859-15
- LOCALESRC_it_CH.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += it_CH.UTF-8
- LOCALESRC_it_CH.UTF-8 = en_US.UTF-8
-
-LOCALES += it_IT.ISO8859-1
- LOCALESRC_it_IT.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += it_IT.ISO8859-15
- LOCALESRC_it_IT.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += it_IT.UTF-8
- LOCALESRC_it_IT.UTF-8 = en_US.UTF-8
-
-#LOCALES += ja_JP.ct
-# LOCALESRC_ja_JP.ct = ja_JP.CTEXT
-
-#LOCALES += ja_JP.eucJP
-# LOCALESRC_ja_JP.eucJP = ja_JP.eucJP
-
-#LOCALES += ja_JP.ISO2022-JP
-# LOCALESRC_ja_JP.ISO2022-JP = ja_JP.ISO-2022-JP
-
-#LOCALES += ja_JP.ISO2022-JP2
-# LOCALESRC_ja_JP.ISO2022-JP2 = ja_JP.ISO-2022-JP-2
-
-#LOCALES += ja_JP.SJIS
-# LOCALESRC_ja_JP.SJIS = ja_JP.SJIS
-
-LOCALES += ja_JP.UTF-8
- LOCALESRC_ja_JP.UTF-8 = en_US.UTF-8
-
-#LOCALES += ko_KR.eucKR
-# LOCALESRC_ko_KR.eucKR = ko_KR.eucKR
-
-LOCALES += ko_KR.UTF-8
- LOCALESRC_ko_KR.UTF-8 = en_US.UTF-8
-
-LOCALES += lt_LT.ISO8859-4
- LOCALESRC_lt_LT.ISO8859-4 = en_US.ISO_8859-4
-
-LOCALES += lt_LT.ISO8859-13
- LOCALESRC_lt_LT.ISO8859-13 = lt_LT.ISO8859-13
-
-LOCALES += nl_BE.ISO8859-1
- LOCALESRC_nl_BE.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += nl_BE.ISO8859-15
- LOCALESRC_nl_BE.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += nl_NL.ISO8859-1
- LOCALESRC_nl_NL.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += nl_NL.ISO8859-15
- LOCALESRC_nl_NL.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += no_NO.ISO8859-1
- LOCALESRC_no_NO.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += no_NO.ISO8859-15
- LOCALESRC_no_NO.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += pl_PL.ISO8859-2
- LOCALESRC_pl_PL.ISO8859-2 = en_US.ISO_8859-2
-
-LOCALES += pl_PL.UTF-8
- LOCALESRC_pl_PL.UTF-8 = en_US.UTF-8
-
-LOCALES += pt_PT.ISO8859-1
- LOCALESRC_pt_PT.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += pt_PT.UTF-8
- LOCALESRC_pt_PT.UTF-8 = en_US.UTF-8
-
-LOCALES += pt_PT.ISO8859-15
- LOCALESRC_pt_PT.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += ro_RO.UTF-8
- LOCALESRC_ro_RO.UTF-8 = en_US.UTF-8
-
-LOCALES += ru_RU.CP866
- LOCALESRC_ru_RU.CP866 = ru_RU.CP866
-
-LOCALES += ru_RU.KOI8-R
- LOCALESRC_ru_RU.KOI8-R = ru_RU.KOI8-R
-
-LOCALES += ru_RU.ISO8859-5
- LOCALESRC_ru_RU.ISO8859-5 = ru_RU.ISO_8859-5
-
-LOCALES += ru_RU.UTF-8
- LOCALESRC_ru_RU.UTF-8 = en_US.UTF-8
-
-LOCALES += sk_SK.ISO8859-2
- LOCALESRC_sk_SK.ISO8859-2 = en_US.ISO_8859-2
-
-LOCALES += sl_SI.ISO8859-2
- LOCALESRC_sl_SI.ISO8859-2 = en_US.ISO_8859-2
-
-LOCALES += sv_SE.ISO8859-1
- LOCALESRC_sv_SE.ISO8859-1 = en_US.ISO_8859-1
-
-LOCALES += sv_SE.ISO8859-15
- LOCALESRC_sv_SE.ISO8859-15 = en_US.DIS_8859-15
-
-LOCALES += sv_SE.UTF-8
- LOCALESRC_sv_SE.UTF-8 = en_US.UTF-8
-
-LOCALES += uk_UA.KOI8-U
- LOCALESRC_uk_UA.KOI8-U = uk_UA.KOI8-U
-
-#LOCALES += zh_CN.eucCN
-# LOCALESRC_zh_CN.eucCN = zh_CN.eucCN
-
-#LOCALES += zh_CN.GB18030
-# LOCALESRC_zh_CN.GB18030 = zh_CN.GB18030
-
-LOCALES += zh_CN.UTF-8
- LOCALESRC_zh_CN.UTF-8 = en_US.UTF-8
-
-#LOCALES += zh_TW.Big5
-# LOCALESRC_zh_TW.Big5 = zh_TW.BIG5
+#LOCALES += Big5
+# LOCALESRC_Big5 = zh_TW.BIG5
 
 # XXX: EUC-TW is not EUC!
-#LOCALES += zh_TW.eucTW
-# LOCALESRC_zh_TW.eucTW = zh_TW.eucTW
-
-LOCALES += zh_TW.UTF-8
- LOCALESRC_zh_TW.UTF-8 = en_US.UTF-8
+#LOCALES += eucTW
+# LOCALESRC_eucTW = zh_TW.eucTW
 
 all: ${LOCALES:S/$/.out/g}
 realall: ${LOCALES:S/$/.out/g}
 
-.for locale in ${LOCALES}
-LOCALESRCS+=   ${LOCALESRC_${locale}}
-.endfor
 CLEANFILES+=   ${LOCALES:S/$/.out/g}
 
-# TODO: more use of symlinks?
 FILES= ${LOCALES:S/$/.out/g}
 .for locale in ${LOCALES}
 FILESDIR_${locale}.out=        ${LOCALEDIR}/${locale}
Index: lib/libc/locale/setrunelocale.c
===================================================================
RCS file: /cvs/src/lib/libc/locale/setrunelocale.c,v
retrieving revision 1.9
diff -u -p -r1.9 setrunelocale.c
--- lib/libc/locale/setrunelocale.c     30 May 2013 18:35:55 -0000      1.9
+++ lib/libc/locale/setrunelocale.c     30 May 2013 19:23:16 -0000
@@ -171,17 +171,27 @@ found:
 }
 
 int
-_xpg4_setrunelocale(const char *encoding)
+_xpg4_setrunelocale(const char *locname)
 {
        char path[PATH_MAX];
        _RuneLocale *rl;
        int error, len;
+       const char *dot, *encoding;
 
-       if (!strcmp(encoding, "C") || !strcmp(encoding, "POSIX")) {
+       if (!strcmp(locname, "C") || !strcmp(locname, "POSIX")) {
                rl = &_DefaultRuneLocale;
                goto found;
        }
 
+       /* Assume "<whatever>.<encoding>" locale name. */
+       dot = strrchr(locname, '.');
+       if (dot == NULL) {
+               /* No encoding specified. Fall back to ASCII. */
+               rl = &_DefaultRuneLocale;
+               goto found;
+       }
+
+       encoding = dot + 1;
        len = snprintf(path, sizeof(path),
            "%s/%s/LC_CTYPE", _PATH_LOCALE, encoding);
        if (len < 0 || len >= sizeof(path))

Reply via email to