I've made a quick little modification to allow me to translate accented
characters down to their low-ASCII equivalents. Basically, it just adds a
second word to the database with all low-ASCII characters. This way I
can still let my users search for accents if they want, but they aren't
forced to enter them.
The translation is also hard-coded right now :( I was thinking of adding a
config file parameter such as "translation_table: 123-162=a 183-184=d" etc.
if this feature is useful, so that others could adapt it without actually
changing the code =)
The only problem with this is that htsearch can't highlight the word
searched for because the accents are no longer there. I just use
"no_excerpt_show_top: true" in my config file to prevent a bunch of
"Search word not found in top of document." messages. Not a big deal for me
because the important thing is that it actually finds relevant documents.
I'm by no means a very good C++ coder, so any suggestions would be great.
Is there a better file for me to be modifying instead of htdig/Retriever?
Thanks for your time,
Alex Chan
*** Retriever.cc Wed Jun 30 16:34:02 1999
--- ../../htdig-3.1.2/htdig/Retriever.cc Wed Apr 21 22:47:57 1999
*************** Retriever::Retriever(RetrieverLog flags)
*** 83,121 ****
fclose(urls_parsed);
}
unlink(filelog);
}
-
-
- // Create the lookup table
- if (config.Boolean("translateaccents", 0)) {
- cout << "setting up transtable: "<<endl;
- for ( unsigned char i = 0; i < 255 ; i++)
- {
- transtable[i] = i;
- }
- // Specific translation range
- // htdig is case insensitive but upper case
- // is included just in case
- TableEntry(224, 230, 'a');
- TableEntry(192, 198, 'A');
- TableEntry(231, 231, 'c');
- TableEntry(199, 199, 'C');
- TableEntry(232, 235, 'e');
- TableEntry(200, 203, 'E');
- TableEntry(236, 239, 'i');
- TableEntry(204, 207, 'I');
- TableEntry(236, 239, 'i');
- TableEntry(241, 241, 'n');
- TableEntry(209, 209, 'N');
- TableEntry(242, 246, 'o');
- TableEntry(210, 214, 'O');
- TableEntry(249, 252, 'u');
- TableEntry(217, 220, 'U');
- TableEntry(253, 255, 'y');
- TableEntry(221, 221, 'Y');
- }
-
}
//*****************************************************************************
--- 83,90 ----
*************** Retriever::~Retriever()
*** 125,161 ****
{
delete doc;
}
- //*******************************************************
- // void Retriever::TableEntry(int start, int finish, unsigned char letter)
- //   Enters an alternate value into the translation table: every character
- //   code from start to finish (inclusive) is mapped to letter.  Codes that
- //   fall outside the table are skipped instead of being written past its
- //   end.
- //
- void
- Retriever::TableEntry(int start, int finish, unsigned char letter)
- {
-     // transtable holds unsigned chars, so sizeof() is its element count.
-     const int tableSize = (int) sizeof(transtable);
-
-     if (start < 0)
-         start = 0;
-     for (int i = start; i <= finish && i < tableSize; i++)
-     {
-         transtable[i] = letter;
-     }
- }
-
- //*****************************************************************************
- // int Retriever::Translate(char *w)
- //   Replaces, in place, each character of w above 127 with its entry in
- //   the translation table.  Returns non-zero if at least one character
- //   was looked up in the table, zero otherwise (including for a null w).
- //
- int
- Retriever::Translate(char* w)
- {
-     if (!w)
-         return false;
-
-     // Walk the string as unsigned char so high characters give positive
-     // table indices; the explicit cast is required because char * does
-     // not convert to unsigned char * implicitly.
-     unsigned char* word = (unsigned char*) w;
-     int HighAsciiFound = false;
-     while (*word)
-     {
-         // Codes the table has no slot for are left untouched.
-         if (*word > 127 && *word < sizeof(transtable))
-         {
-             *word = transtable[*word];
-             HighAsciiFound = true;
-         }
-         word++;
-     }
-     return HighAsciiFound;
- }
-
//*****************************************************************************
// void Retriever::setUsernamePassword(char *credentials)
//
--- 94,101 ----
*************** Retriever::GetRef(char *u)
*** 928,936 ****
//
void
Retriever::got_word(char *word, int location, int heading)
{
- static bool translateaccents = config.Boolean("translateaccents", 0);
if (debug > 3)
cout << "word: " << word << '@' << location << endl;
if (heading > 11 || heading < 0) // Current limits for headings
heading = 0; // Assume it's just normal text
--- 868,875 ----
*************** Retriever::got_word(char *word, int loca
*** 938,956 ****
{
String w = word;
HtStripPunctuation(w);
if (w.length() >= minimumWordLength)
- {
words.Word(w, location, current_anchor_number, factor[heading]);
-
- if (translateaccents){
- if (Translate(w)) {
- // Add the word in again with accents translated down
- words.Word(w, location, current_anchor_number,
factor[heading]);
- }
-
- }
- }
}
}
--- 877,885 ----
*** Retriever.h Wed Jun 30 14:57:23 1999
--- ../../htdig-3.1.2/htdig/Retriever.h Wed Apr 21 22:47:57 1999
*************** private:
*** 123,136 ****
void RetrievedDocument(Document &, char *url, DocumentRef *ref);
void parse_url(URLRef &urlRef);
void got_redirect(char *, DocumentRef *);
void recordNotFound(char *url, char *referer, int reason);
-
-
-
- int Translate( char* word);
- void TableEntry(int start, int finish, unsigned char letter);
- unsigned char transtable[255];
};
#endif
--- 123,130 ----