commit:     fc0c62e2a69cd2e15910f407a4092487c78c63d8
Author:     Kerin Millar <kfm <AT> plushkava <DOT> net>
AuthorDate: Wed Nov 19 16:57:09 2025 +0000
Commit:     Kerin Millar <kfm <AT> plushkava <DOT> net>
CommitDate: Wed Nov 19 18:00:05 2025 +0000
URL:        https://gitweb.gentoo.org/proj/locale-gen.git/commit/?id=fc0c62e2

Allow localename fields to forgo a (redundant) codeset

Presently, the locale.gen(5) configuration file is validated strictly
against the contents of the SUPPORTED file. Consequently, for any given
pair of localename and charmap fields, it may or may not be necessary to
redundantly incorporate the charmap into the localename field.

# Valid. Note that the charmap must be expressed twice.
en_US.UTF-8 UTF-8

# Valid.
en_US ISO-8859-1

# Invalid because there is no precise match against a SUPPORTED entry.
en_US UTF-8

This commit adjusts the locale.gen(5) validation methodology in such a
way that it is never necessary to incorporate the charmap into the
localename field. Backward-compatibility is maintained. That is, any
configuration that could be validated by locale-gen 3.9 shall continue
to be valid.

To understand how this is accomplished, consider the prior behaviour of
the map_supported_combinations() subroutine. Imagine that it is tasked with
reading the two entries in the SUPPORTED file that pertain to the United
States. In that case, it would generate the following data structure.

{
   "en_US": {
      "ISO-8859-1": 1, // allows "en_US ISO-8859-1" with "en_US" alias
   },
   "en_US.UTF-8": {
      "UTF-8": 1       // allows "en_US.UTF-8 UTF-8"
   }
}

Now, the data structure shall instead be as follows.

{
   "en_US": {
      "ISO-8859-1": 1, // allows "en_US ISO-8859-1" with "en_US" alias
      "UTF-8": 0       // allows "en_US UTF-8"
   },
   "en_US.UTF-8": {
      "UTF-8": 0       // allows "en_US.UTF-8 UTF-8"
   }
}

Note that the decision as to whether to generate a short-form alias is
based on the (now meaningful) value of the innermost key, duly ensuring
that there are no backward-compatibility breaks in that regard.

The mkconfig utility has been updated so as to refrain from
incorporating the UTF-8 charmap into the first field of each of the
entries that it generates.

Finally, the man pages have yet to be modified so as to precisely
document the validation behaviour. They shall be attended to by a
forthcoming commit.

See-also: 588e3ede89a6b630420839e704cf975a28f0d981
See-also: d1d1854a402b02b23b08725dac069984506f1f96
Signed-off-by: Kerin Millar <kfm <AT> plushkava.net>

 locale-gen | 29 ++++++++++++++++++++++++-----
 mkconfig   | 10 ++++++----
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/locale-gen b/locale-gen
index 1c25e6b..e715807 100644
--- a/locale-gen
+++ b/locale-gen
@@ -316,7 +316,23 @@ sub map_supported_combinations ($prefix) {
        while (my $line = readline $fh) {
                chomp $line;
                if (2 == (my ($locale, $charmap) = split ' ', $line)) {
-                       $supported_by{$locale}{$charmap} = 1;
+                       # Designate the locale/charmap combination as supported.
+                       $supported_by{$locale}{$charmap} = 0;
+
+                       # Determine whether the locale merits a short-form alias
+                       # by attempting to strip its codeset part, if any. See
+                       # the parse_entry() subroutine as to the implications.
+                       my $is_aliasable = $locale !~ s/\.[^@]+//;
+
+                       # Designate the locale/charmap combination as supported,
+                       # and potentially as one that merits a short-form alias.
+                       # Since the codeset part was stripped, this also makes
+                       # it possible to specify locales without incorporating
+                       # a redundant charmap. For example, "en_US.UTF-8 UTF-8"
+                       # may instead be specified as "en_US UTF-8".
+                       $supported_by{$locale}{$charmap} = $is_aliasable;
+
+                       # Designate the charmap as supported in its own right.
                        $supported_by{''}{$charmap} = 1;
                }
        }
@@ -367,18 +383,19 @@ sub parse_config ($fh, $path, $supported_by) {
                }
 
                # Validate both locale and character map before accepting.
+               my $is_aliasable;
                if (! $supported_by->{$locale}) {
                        $thrower->('Invalid locale', $line);
                } elsif (! $supported_by->{''}{$charmap}) {
                        $thrower->('Invalid charmap', $line);
-               } elsif (! $supported_by->{$locale}{$charmap}) {
+               } elsif (! defined($is_aliasable = 
$supported_by->{$locale}{$charmap})) {
                        $thrower->('Unsupported locale/charmap combination', 
$line);
                }
 
                # Determine the locale name in both the form that accords with
                # the subdirectories of /usr/share/i18n/locales, and in the
                # canonical form that incorporates the <codeset> part.
-               ($locale, my $canonical, my $name) = parse_entry($locale, 
$charmap);
+               ($locale, my $canonical, my $name) = parse_entry($locale, 
$charmap, $is_aliasable);
 
                push @locales, [ $locale, $charmap, $canonical, $name ];
        }
@@ -386,7 +403,7 @@ sub parse_config ($fh, $path, $supported_by) {
        return @locales;
 }
 
-sub parse_entry ($locale, $charmap) {
+sub parse_entry ($locale, $charmap, $is_aliasable) {
        my $canonical;
        my $name;
        if (2 == (my @fields = split /@/, $locale, 3)) {
@@ -405,7 +422,9 @@ sub parse_entry ($locale, $charmap) {
                # of its canonical name. For example, "en_US" may refer to
                # "en_US.iso88591". It is strongly discouraged to rely on this
                # behaviour. Still, for now, arrange for such aliases to exist.
-               $name = $locale;
+               if ($is_aliasable) {
+                       $name = $locale;
+               }
        }
        return $locale, $canonical, $name // $canonical;
 }

diff --git a/mkconfig b/mkconfig
index a57940c..3b57912 100755
--- a/mkconfig
+++ b/mkconfig
@@ -34,10 +34,12 @@ use File::Slurper qw(read_dir read_lines read_text);
        };
 
        for my $line (@lines) {
-               my ($read_locale, $charmap) = split ' ', $line;
+               my ($locale, $charmap) = split ' ', $line;
 
-               # The names of the templates don't incorporate a codeset part.
-               my $locale = $read_locale =~ s/\.[^@]+//r;
+               # The names of the templates do not incorporate a codeset part.
+               # Further, as of locale-gen 3.10, the locale.gen(5) file never
+               # requires for the locale field to incorporate a codeset.
+               $locale =~ s/\.[^@]+//;
 
                # Select only UTF-8 locales and refrain from incorporating the
                # C.UTF-8 locale because is always compiled by locale-gen(8).
@@ -54,7 +56,7 @@ use File::Slurper qw(read_dir read_lines read_text);
                        if (length $territory) {
                                $comment .= " ($territory)";
                        }
-                       printf {$pipe} "# %s\037%s\037# %s\n", $read_locale, 
$charmap, $comment;
+                       printf {$pipe} "# %s\037%s\037# %s\n", $locale, 
$charmap, $comment;
                }
        }
        close $pipe or exit 1;

Reply via email to