On Mon, Oct 14, 2024 at 01:25:31PM +0100, Gavin Smith wrote: > These numbers do not seem so high that we need an ultra-optimised > algorithm (developed by C++ boffins) to deal with them - anything faster > than a linear search would probably good enough. > > I have some spare time over the next couple of days so may be able > to come up with something. > > We do not need the full level of generality that a general purpose library > in C would likely provide as the hashmap is from std::string (char * from > the C code) and maps to an int, and moreover keys only take the value 1 or > are undefined.
I may as well post what I've been able to come up with. The new code is about 150 lines of C, in the file convert/hashmap.c (patch below). Probably not perfect but hopefully simple enough to be maintainable. I've tested this with three manuals with teximakehtml: (seconds) manual C++ C linear search elisp 1.26 1.26 1.62 libc 1.50 1.51 1.82 texinfo 0.39 0.40 0.47 There was no detectable difference in memory usage. (I thought I might have made a mistake as the numbers for C and C++ implementations seemed so close.) I've checked with valgrind that all memory is freed correctly at the end. I used block allocation to save time on cleanup. If this approach looks ok I can try to tidy it up and commit it. We could also experiment with different numbers of hash bins or the memory allocator. I'll read a bit more about hash tables as I don't know a lot about this subject. It could also be possible to use a completely different algorithm like a binary tree. diff --git a/tp/Texinfo/XS/Makefile.am b/tp/Texinfo/XS/Makefile.am index 101038877b..9a9fcc622e 100644 --- a/tp/Texinfo/XS/Makefile.am +++ b/tp/Texinfo/XS/Makefile.am @@ -404,6 +404,8 @@ C_libtexinfo_convert_sources = \ convert/convert_html.c \ convert/format_html.h \ convert/format_html.c \ + convert/hashmap.c \ + convert/hashmap.h \ convert/html_converter_types.h \ convert/html_converter_init_options.c \ convert/html_converter_finish.c \ diff --git a/tp/Texinfo/XS/convert/converter.c b/tp/Texinfo/XS/convert/converter.c index e5d38c25ea..fe96a53b75 100644 --- a/tp/Texinfo/XS/convert/converter.c +++ b/tp/Texinfo/XS/convert/converter.c @@ -283,6 +283,8 @@ new_converter (enum converter_format format, unsigned long flags) /* set low level data representations options */ if (flags & CONVF_string_list) converter->ids_data_type = IDT_string_list; + else if (flags & CONVF_hashmap) + converter->ids_data_type = IDT_hashmap; #ifdef HAVE_CXX_HASHMAP else if (flags & CONVF_cxx_hashmap) converter->ids_data_type = IDT_cxx_hashmap; @@ -454,10 +456,9 @@ converter_converter (enum converter_format format, CONVERTER_INITIALIZATION_INFO *format_defaults; unsigned long flags; - /* NOTE if HAVE_CXX_HASHMAP is not set, even with CONVF_cxx_hashmap - string lists will be used */ if (!converter_flags) - flags = CONVF_cxx_hashmap; + //flags = CONVF_cxx_hashmap; + flags = CONVF_hashmap; /* To use a string list. Slower. flags = CONVF_string_list; diff --git a/tp/Texinfo/XS/convert/get_converter_perl_info.c b/tp/Texinfo/XS/convert/get_converter_perl_info.c index 364bcb2af2..bece2fff3b 100644 --- a/tp/Texinfo/XS/convert/get_converter_perl_info.c +++ b/tp/Texinfo/XS/convert/get_converter_perl_info.c @@ -105,7 +105,8 @@ get_or_create_sv_converter (SV *converter_in, const char *input_class) } converter_descriptor = new_converter (converter_format, - CONVF_perl_hashmap); + //CONVF_perl_hashmap); + CONVF_hashmap); /* CONVF_string_list); */ diff --git a/tp/Texinfo/XS/convert/hashmap.c b/tp/Texinfo/XS/convert/hashmap.c new file mode 100644 index 0000000000..dd55230e6b --- /dev/null +++ b/tp/Texinfo/XS/convert/hashmap.c @@ -0,0 +1,164 @@ +/* Copyright 2024 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> + +#include "converter_types.h" + +#include "hashmap.h" + +typedef struct BUCKET { + /* Linked list of strings. */ + char *string; + struct BUCKET *next; +} BUCKET; + +/* Allocator for bucket object. */ +#define BUCKETS_PER_ARENA 64 + +typedef struct BUCKET_ARENA { + BUCKET buckets[BUCKETS_PER_ARENA]; + int used; + struct BUCKET_ARENA *next; +} BUCKET_ARENA; + +#define NBUCKETS 256 +typedef struct C_HASHMAP { + BUCKET *bucket[NBUCKETS]; + long count; + + BUCKET_ARENA *arena; +} C_HASHMAP; + +static BUCKET * +new_bucket (C_HASHMAP *H) +{ + if (H->arena->used < BUCKETS_PER_ARENA) + return &H->arena->buckets[H->arena->used++]; + + BUCKET_ARENA *new_arena = malloc (sizeof (BUCKET_ARENA)); + memset (new_arena, 0, sizeof (BUCKET_ARENA)); + + /* Add to front of list. */ + new_arena->next = H->arena; + H->arena = new_arena; + + return &H->arena->buckets[H->arena->used++]; +} + + +static unsigned int +hash_string (const char *string) +{ + unsigned int hash = 0; + + char c; + const char *pc = string; + + while ((c = *pc)) + { + hash *= 127; /* prime */ + hash += c; + pc++; + } + + hash %= NBUCKETS; + + return hash; +} + +void +init_registered_ids_c_hashmap (CONVERTER *self) +{ + C_HASHMAP *H = malloc (sizeof (C_HASHMAP)); + memset (H, 0, sizeof (C_HASHMAP)); + + H->arena = malloc (sizeof (BUCKET_ARENA)); + memset (H->arena, 0, sizeof (BUCKET_ARENA)); + + self->registered_ids_c_hashmap = H; +} + +int +is_c_hashmap_registered_id (CONVERTER *self, const char *in_string) +{ + C_HASHMAP *H = (C_HASHMAP *)self->registered_ids_c_hashmap; + unsigned int hash = hash_string(in_string); + BUCKET *B = H->bucket[hash]; + + while (B) + { + if (!strcmp(B->string, in_string)) + return 1; + B = B->next; + } + + return 0; +} + +void +c_hashmap_register_id (CONVERTER *self, const char *in_string) +{ + C_HASHMAP *H = (C_HASHMAP *)self->registered_ids_c_hashmap; + + BUCKET *new = new_bucket(H); + new->string = strdup (in_string); + unsigned int hash = hash_string(in_string); + + /* Add to front of linked list. */ + new->next = H->bucket[hash]; + H->bucket[hash] = new; + + H->count++; +} + +void +clear_registered_ids_c_hashmap (CONVERTER *self) +{ + C_HASHMAP *H = (C_HASHMAP *)self->registered_ids_c_hashmap; + int i; + + BUCKET_ARENA *arena, *next; + /* Free chain. */ + next = H->arena; + while (next) + { + arena = next; + next = arena->next; + + /* TODO: also need to loop through until arena->used and + free strings. */ + for (i = 0; i < arena->used; i++) + { + free (arena->buckets[i].string); + } + free (arena); + } + + memset (H, 0, sizeof (C_HASHMAP)); +} + +void +free_registered_ids_c_hashmap (CONVERTER *self) +{ + C_HASHMAP *H = (C_HASHMAP *)self->registered_ids_c_hashmap; + clear_registered_ids_c_hashmap (self); + free (H); +} + diff --git a/tp/Texinfo/XS/convert/hashmap.h b/tp/Texinfo/XS/convert/hashmap.h new file mode 100644 index 0000000000..3f7c67396a --- /dev/null +++ b/tp/Texinfo/XS/convert/hashmap.h @@ -0,0 +1,27 @@ +/* hashmap.h - declarations for hashmap.c */ +#ifndef HASHMAP_H +#define HASHMAP_H + +/* Copyright 2024 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +void init_registered_ids_c_hashmap (CONVERTER *self); +int is_c_hashmap_registered_id (CONVERTER *self, const char *in_string); +void c_hashmap_register_id (CONVERTER *self, const char *in_string); +void clear_registered_ids_c_hashmap (CONVERTER *self); +void free_registered_ids_c_hashmap (CONVERTER *self); + + +#endif diff --git a/tp/Texinfo/XS/convert/html_converter_finish.c b/tp/Texinfo/XS/convert/html_converter_finish.c index 9ecbf80e63..d5bdf7eb33 100644 --- a/tp/Texinfo/XS/convert/html_converter_finish.c +++ b/tp/Texinfo/XS/convert/html_converter_finish.c @@ -31,6 +31,7 @@ #include "api_to_perl.h" #include "call_html_perl_function.h" #include "call_html_cxx_function.h" +#include "hashmap.h" /* html_reset_translated_special_unit_info_tree html_clear_direction_string_type */ #include "convert_html.h" @@ -161,6 +162,8 @@ html_reset_converter (CONVERTER *self) if (self->ids_data_type == IDT_perl_hashmap) clear_registered_ids_hv (self); + else if (self->ids_data_type == IDT_hashmap) + clear_registered_ids_c_hashmap (self); #ifdef HAVE_CXX_HASHMAP else if (self->ids_data_type == IDT_cxx_hashmap) clear_registered_ids_hashmap (self); @@ -342,6 +345,8 @@ html_free_converter (CONVERTER *self) if (self->ids_data_type == IDT_perl_hashmap) free_registered_ids_hv (self); + else if (self->ids_data_type == IDT_hashmap) + free_registered_ids_c_hashmap (self); #ifdef HAVE_CXX_HASHMAP else if (self->ids_data_type == IDT_cxx_hashmap) free_registered_ids_hashmap (self); diff --git a/tp/Texinfo/XS/convert/html_prepare_converter.c b/tp/Texinfo/XS/convert/html_prepare_converter.c index 3d786e1c1f..5034489fc2 100644 --- a/tp/Texinfo/XS/convert/html_prepare_converter.c +++ b/tp/Texinfo/XS/convert/html_prepare_converter.c @@ -55,6 +55,7 @@ #include "converter.h" #include "call_html_perl_function.h" #include "call_html_cxx_function.h" +#include "hashmap.h" #include "format_html.h" /* html_complete_no_arg_commands_formatting html_run_stage_handlers html_add_to_files_source_info html_find_file_source_info @@ -1721,6 +1722,8 @@ html_converter_customize (CONVERTER *self) if (self->ids_data_type == IDT_perl_hashmap) init_registered_ids_hv (self); + else if (self->ids_data_type == IDT_hashmap) + init_registered_ids_c_hashmap (self); #ifdef HAVE_CXX_HASHMAP else if (self->ids_data_type == IDT_cxx_hashmap) init_registered_ids_hashmap (self); @@ -3768,6 +3771,8 @@ html_id_is_registered (CONVERTER *self, const char *string) { if (self->ids_data_type == IDT_perl_hashmap) return is_hv_registered_id (self, string); + else if (self->ids_data_type == IDT_hashmap) + return is_c_hashmap_registered_id (self, string); #ifdef HAVE_CXX_HASHMAP else if (self->ids_data_type == IDT_cxx_hashmap) return is_hashmap_registered_id (self, string); @@ -3781,6 +3786,8 @@ html_register_id (CONVERTER *self, const char *string) { if (self->ids_data_type == IDT_perl_hashmap) hv_register_id (self, string); + else if (self->ids_data_type == IDT_hashmap) + c_hashmap_register_id (self, string); #ifdef HAVE_CXX_HASHMAP else if (self->ids_data_type == IDT_cxx_hashmap) hashmap_register_id (self, string); diff --git a/tp/Texinfo/XS/main/converter_types.h b/tp/Texinfo/XS/main/converter_types.h index 16a14328ee..cca8cf20b9 100644 --- a/tp/Texinfo/XS/main/converter_types.h +++ b/tp/Texinfo/XS/main/converter_types.h @@ -42,12 +42,14 @@ enum ids_data_type { IDT_perl_hashmap, IDT_cxx_hashmap, IDT_string_list, + IDT_hashmap, }; /* converter low level customization */ #define CONVF_perl_hashmap 0x0001 #define CONVF_string_list 0x0002 #define CONVF_cxx_hashmap 0x0004 +#define CONVF_hashmap 0x0008 /* for string information passing to/from perl */ enum sv_string_type { @@ -883,6 +885,7 @@ typedef struct CONVERTER { STRING_LIST *registered_ids; /* actually HV * but we do not want to drag in Perl headers */ void *registered_ids_hv; + void *registered_ids_c_hashmap; #ifdef HAVE_CXX_HASHMAP /* a pointer on C++ data */ void *registered_ids_hashmap; diff --git a/tp/Texinfo/XS/teximakehtml.c b/tp/Texinfo/XS/teximakehtml.c index 16029d4bd8..ecb353215e 100644 --- a/tp/Texinfo/XS/teximakehtml.c +++ b/tp/Texinfo/XS/teximakehtml.c @@ -313,10 +313,11 @@ main (int argc, char *argv[]) &converter_texinfo_language_config_dirs, &convert_options, /* default, use C++ hashmap if available */ - 0); - /* to test linear search - CONVF_string_list); - */ + //0); + /* to test linear search */ + //CONVF_string_list); + /* to test C implementation of hashmap */ + CONVF_hashmap); free_strings_list (&converter_texinfo_language_config_dirs); free_strings_list (&texinfo_language_config_dirs);