commit:     d1d1854a402b02b23b08725dac069984506f1f96
Author:     Kerin Millar <kfm <AT> plushkava <DOT> net>
AuthorDate: Mon Oct 20 03:54:38 2025 +0000
Commit:     Kerin Millar <kfm <AT> plushkava <DOT> net>
CommitDate: Mon Oct 20 05:11:44 2025 +0000
URL:        https://gitweb.gentoo.org/proj/locale-gen.git/commit/?id=d1d1854a

Produce short-form aliases for a subset of valid locales

Where localedef(1) is instructed to incorporate a directory into an
archive, it is the name of the directory that determines the apparent
name of the locale. However, if the name of the directory does not
incorporate a codeset part, localedef(1) will sniff the codeset from the
contents of the directory then proceed to incorporate two locales, with
one serving as an alias of the other.

# cd /usr/lib/locale
# localedef --no-archive -i en_US -f ISO-8859-1 en_US # ambiguous
# localedef --add-to-archive --replace en_US
# locale -a
en_US
en_US.iso88591

These short-form aliases are bad for interoperability. Even among
GNU/Linux vendors, there is no consensus as to the circumstances under
which they should be offered (if at all), nor the character maps that
should be implied by effecting any of them.

That being said, locale-gen-2.23 would incorporate these aliases under
certain conditions, whereas >=locale-gen-3.0 currently does not. As
such, there is a legitimate concern as to backward-compatibility. It
stands to reason that some Gentoo users will have chosen to effect a
short-form locale. For these locales to disappear can have adverse
effects on any applications and services that those users depend upon.

Address this concern by having locale-gen(8) predictably incorporate
short-form aliases for certain locale/charmap combinations. Consider the
following configuration.

# All supported combinations for the USA
en_US.UTF-8    UTF-8
en_US          ISO-8859-1 # this will be aliased as "en_US"

# All supported combinations for China
zh_CN.GB18030  GB18030
zh_CN.GBK      GBK
zh_CN.UTF-8    UTF-8
zh_CN          GB2312 # this will be aliased as "zh_CN"

This is also the strategy employed by (at least) Fedora and Void Linux.

Reported-by: Horste Prote <prote <AT> fmi.uni-stuttgart.de>
Closes: https://bugs.gentoo.org/963974
Signed-off-by: Kerin Millar <kfm <AT> plushkava.net>

 locale-gen | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/locale-gen b/locale-gen
index 56f8eb1..b703f1d 100644
--- a/locale-gen
+++ b/locale-gen
@@ -64,7 +64,7 @@ umask 0022;
        }
 
        # Ensure that the C.UTF-8 locale is made available.
-       my @locales = ([ 'C', 'UTF-8', 'C.UTF-8' ]);
+       my @locales = ([ 'C', 'UTF-8', 'C.UTF-8', 'C.UTF-8' ]);
 
        # Compose a list of up to two configuration files to be read.
        my @config_files = select_config_files($prefix, %opt);
@@ -116,8 +116,8 @@ umask 0022;
        # Integrate the compiled locales into a new locale archive.
        my $src_path = do {
                my $prior_archive = $opt{'update'} ? $dst_path : undef;
-               my @canonicals = map +( $_->[2] ), @locales;
-               generate_archive($gentoo_prefix, $locale_dir, $prior_archive, @canonicals);
+               my @names = map +( $_->[3] ), @locales;
+               generate_archive($gentoo_prefix, $locale_dir, $prior_archive, @names);
        };
 
        # Install the new locale archive.
@@ -381,9 +381,9 @@ sub parse_config ($fh, $path, $supported_by) {
                # Determine the locale name in both the form that accords with
                # the subdirectories of /usr/share/i18n/locales, and in the
                # canonical form that incorporates the <codeset> part.
-               ($locale, my $canonical) = parse_entry($locale, $charmap);
+               ($locale, my $canonical, my $name) = parse_entry($locale, $charmap);
 
-               push @locales, [ $locale, $charmap, $canonical ];
+               push @locales, [ $locale, $charmap, $canonical, $name ];
        }
 
        return @locales;
@@ -391,6 +391,7 @@ sub parse_config ($fh, $path, $supported_by) {
 
 sub parse_entry ($locale, $charmap) {
        my $canonical;
+       my $name;
        if (2 == (my @fields = split /@/, $locale, 3)) {
                # de_DE@euro ISO-8859-15 => de_DE.ISO-8859-15@euro
                 $canonical = sprintf '%s.%s@%s', $fields[0], $charmap, $fields[1];
@@ -401,8 +402,18 @@ sub parse_entry ($locale, $charmap) {
        } elsif (1 == @fields) {
                # en_US ISO-8859-1 => en_US.ISO-8859-1
                $canonical = "$locale.$charmap";
+
+               # Where given an input path whose name does not incorporate a
+               # charmap, localedef(1) will incorporate it into the archive as
+               # an alias of its canonical name. For example, "en_US" may be
+               # rendered a valid name that is equivalent to "en_US.iso88591".
+               # It is strongly discouraged to rely on these in any capacity.
+               # Still, for the time being, arrange for them to exist.
+               if ($locale ne 'C') {
+                       $name = $locale;
+               }
        }
-       return $locale, $canonical;
+       return $locale, $canonical, $name // $canonical;
 }
 
 sub check_archive_dir ($prefix, $locale_dir) {
@@ -454,7 +465,7 @@ sub generate_locales ($workers, @locales) {
                        last if 0 != ($status_by{$pid} = $?);
                }
 
-               my ($locale, $charmap, $canonical) = $locales[$i]->@*;
+               my ($locale, $charmap, $canonical, $name) = $locales[$i]->@*;
                printf "[%*d/%d] Compiling locale: %s\n",
                        $num_width, $i + 1, $total, $canonical;
 
@@ -464,7 +475,7 @@ sub generate_locales ($workers, @locales) {
                        last;
                } elsif ($pid == 0) {
                        @SIG{'INT', 'TERM'} = ('DEFAULT', 'DEFAULT');
-                       compile_locale($locale, $charmap, $canonical);
+                       compile_locale($locale, $charmap, $name);
                }
        } continue {
                last if $DEFERRED_SIGNAL;
@@ -493,8 +504,8 @@ sub generate_locales ($workers, @locales) {
        }
 }
 
-sub compile_locale ($locale, $charmap, $canonical) {
-       my $output_dir = "./$canonical";
+sub compile_locale ($locale, $charmap, $name) {
+       my $output_dir = "./$name";
         run('localedef', '--no-archive', '-i', $locale, '-f', $charmap, '--', $output_dir);
 }
 

Reply via email to