This may work for your needs with a little fine tuning. Special and accented
characters can be represented in HTML with a character name or a numeric
value. For example, " can be represented as &quot; or as &#034; and it
appears from your example that both are used. I've attached a
dput(HTMLChars) to the end of this message with the concordances. The
following works on your data, but I haven't included any error checking.
Assuming the text from your .csv file is in a character vector called txt
and the data.frame HTMLChars is loaded:

# Search for &Name; (named entities such as &eacute; or &frac12;).
# Expects `txt` (character vector) and the `HTMLChars` lookup table
# (columns Character, Number, Name) to exist already.
# The pattern allows digits after the first letter so that names like
# &frac14; and &sup2;, which are in HTMLChars, are found as well.
lsta <- unique(unlist(regmatches(txt,
                                 gregexpr("&[[:alpha:]][[:alnum:]]*;", txt))))
lsta <- data.frame(Name = lsta, stringsAsFactors = FALSE)
# merge() joins on the shared column `Name`, keeping only entities that
# have an entry in the lookup table.
matches <- merge(HTMLChars, lsta)
# seq_len() makes the loop a no-op when nothing matched; 1:nrow(matches)
# would iterate over c(1, 0) and call gsub() with a missing pattern.
for (i in seq_len(nrow(matches))) {
  # fixed = TRUE: the entity is a literal string, not a regular expression.
  txt <- gsub(matches$Name[i], matches$Character[i], txt, fixed = TRUE)
}

# Search for &#Number; (numeric entities such as &#039; or &#233;).
# Expects `txt` (character vector) and the `HTMLChars` lookup table
# (columns Character, Number, Name) to exist already.
lstn <- unique(unlist(regmatches(txt, gregexpr("&#[[:digit:]]+;", txt))))
lstn <- data.frame(Number = lstn, stringsAsFactors = FALSE)
# Look entities up by numeric value rather than by exact string, so that
# "&#39;" and "&#039;" both resolve to the same table row.
found_code <- as.numeric(gsub("[^[:digit:]]", "", lstn$Number))
table_code <- as.numeric(gsub("[^[:digit:]]", "", HTMLChars$Number))
lstn$Character <-
  as.character(HTMLChars$Character)[match(found_code, table_code)]
# Entities with no entry in the table are left untouched in txt.
matches <- lstn[!is.na(lstn$Character), , drop = FALSE]
# seq_len() makes the loop a no-op when nothing matched; 1:nrow(matches)
# would iterate over c(1, 0) and call gsub() with a missing pattern.
for (i in seq_len(nrow(matches))) {
  # fixed = TRUE: the entity is a literal string, not a regular expression.
  txt <- gsub(matches$Number[i], matches$Character[i], txt, fixed = TRUE)
}

txt now contains the converted characters.

dput(HTMLChars)
structure(list(Character = c("\"", "'", "&", "<", ">", "", "¡", 
"¢", "£", "¤", "¥", "¦", "§", "¨", "©", "ª", "«", "¬", "­­", 
"®", "¯", "°", "±", "²", "³", "´", "µ", "¶", "·", "¸", "¹", "º", 
"»", "¼", "½", "¾", "¿", "×", "÷", "À", "Á", "Â", "Ã", "Ä", "Å", 
"Æ", "Ç", "È", "É", "Ê", "Ë", "Ì", "Í", "Î", "Ï", "Ð", "Ñ", "Ò", 
"Ó", "Ô", "Õ", "Ö", "Ø", "Ù", "Ú", "Û", "Ü", "Ý", "Þ", "ß", "à", 
"á", "â", "ã", "ä", "å", "æ", "ç", "è", "é", "ê", "ë", "ì", "í", 
"î", "ï", "ð", "ñ", "ò", "ó", "ô", "õ", "ö", "ø", "ù", "ú", "û", 
"ü", "ý", "þ"), Number = c("&#034;", "&#039;", "&#038;", "&#060;", 
"&#062;", "&#160;", "&#161;", "&#162;", "&#163;", "&#164;", "&#165;", 
"&#166;", "&#167;", "&#168;", "&#169;", "&#170;", "&#171;", "&#172;", 
"&#173;", "&#174;", "&#175;", "&#176;", "&#177;", "&#178;", "&#179;", 
"&#180;", "&#181;", "&#182;", "&#183;", "&#184;", "&#185;", "&#186;", 
"&#187;", "&#188;", "&#189;", "&#190;", "&#191;", "&#215;", "&#247;", 
"&#192;", "&#193;", "&#194;", "&#195;", "&#196;", "&#197;", "&#198;", 
"&#199;", "&#200;", "&#201;", "&#202;", "&#203;", "&#204;", "&#205;", 
"&#206;", "&#207;", "&#208;", "&#209;", "&#210;", "&#211;", "&#212;", 
"&#213;", "&#214;", "&#216;", "&#217;", "&#218;", "&#219;", "&#220;", 
"&#221;", "&#222;", "&#223;", "&#224;", "&#225;", "&#226;", "&#227;", 
"&#228;", "&#229;", "&#230;", "&#231;", "&#232;", "&#233;", "&#234;", 
"&#235;", "&#236;", "&#237;", "&#238;", "&#239;", "&#240;", "&#241;", 
"&#242;", "&#243;", "&#244;", "&#245;", "&#246;", "&#248;", "&#249;", 
"&#250;", "&#251;", "&#252;", "&#253;", "&#254;"), Name = c("&quot;", 
"&apos;", "&amp;", "&lt;", "&gt;", "&nbsp;", "&iexcl;", "&cent;", 
"&pound;", "&curren;", "&yen;", "&brvbar;", "&sect;", "&uml;", 
"&copy;", "&ordf;", "&laquo;", "&not;", "&shy;", "&reg;", "&macr;", 
"&deg;", "&plusmn;", "&sup2;", "&sup3;", "&acute;", "&micro;", 
"&para;", "&middot;", "&cedil;", "&sup1;", "&ordm;", "&raquo;", 
"&frac14;", "&frac12;", "&frac34;", "&iquest;", "&times;", "&divide;", 
"&Agrave;", "&Aacute;", "&Acirc;", "&Atilde;", "&Auml;", "&Aring;", 
"&AElig;", "&Ccedil;", "&Egrave;", "&Eacute;", "&Ecirc;", "&Euml;", 
"&Igrave;", "&Iacute;", "&Icirc;", "&Iuml;", "&ETH;", "&Ntilde;", 
"&Ograve;", "&Oacute;", "&Ocirc;", "&Otilde;", "&Ouml;", "&Oslash;", 
"&Ugrave;", "&Uacute;", "&Ucirc;", "&Uuml;", "&Yacute;", "&THORN;", 
"&szlig;", "&agrave;", "&aacute;", "&acirc;", "&atilde;", "&auml;", 
"&aring;", "&aelig;", "&ccedil;", "&egrave;", "&eacute;", "&ecirc;", 
"&euml;", "&igrave;", "&iacute;", "&icirc;", "&iuml;", "&eth;", 
"&ntilde;", "&ograve;", "&oacute;", "&ocirc;", "&otilde;", "&ouml;", 
"&oslash;", "&ugrave;", "&uacute;", "&ucirc;", "&uuml;", "&yacute;", 
"&thorn;")), .Names = c("Character", "Number", "Name"), row.names = c(NA, 
100L), class = "data.frame")

-------
David

> -----Original Message-----
> From: Michael Friendly [mailto:frien...@yorku.ca]
> Sent: Friday, August 10, 2012 12:14 PM
> To: dcarl...@tamu.edu
> Cc: 'R-help'
> Subject: Re: [R] translating HTML character entities to accented
> characters
> 
> Thanks, David
> 
> I need an all-R solution for this, because the author.csv file is
> exported from a database that enforces the HTML
> encoding and the import into R may have to be repeated several times as
> the database is updated.
> 
> -Michael
> 
> On 8/10/2012 12:40 PM, David L Carlson wrote:
> > It's not quite an R solution, but I just pasted your examples into a
> script
> > window in R and saved it as chars.html. Then I opened it in Firefox
> and
> > pasted the results here (with returns inserted to match your
> original).
> >
> >> grep("&", author$lname, value=TRUE)
> > [1] "Frère de Montizon" "Lumière"
> > [3] "Lumière" "Niépce"
> > [5] "Süssmilch" "Schüpbach"
> >> grep("&", author$birthplace, value=TRUE)
> > [1] "Marbach, Württemberg"
> > [2] "Côte-d'Or"
> > [3] "Chalon-sur-Saône, Saône-et-Loire"
> > [4] "Groß Särchen, Germany"
> >> apropos("HTML")
> > For a CSV file you would want to preserve the lines by adding <br> to
> the
> > end of each line first.
> >
> > ----------------------------------------------
> > David L Carlson
> > Associate Professor of Anthropology
> > Texas A&M University
> > College Station, TX 77843-4352
> >
> >
> >
> >> -----Original Message-----
> >> From: r-help-boun...@r-project.org [mailto:r-help-bounces@r-
> >> project.org] On Behalf Of Michael Friendly
> >> Sent: Friday, August 10, 2012 11:15 AM
> >> To: R-help
> >> Subject: [R] translating HTML character entities to accented
> characters
> >>
> >> I've imported a .csv file where character strings that contained
> >> accented characters were written as HTML
> >> character entities.  Is there a function that works on a vector to
> >> translate them back to accented (latin1) characters?
> >>
> >> Some examples:
> >>
> >>   > grep("&", author$lname, value=TRUE)
> >> [1] "Fr&egrave;re de Montizon" "Lumi&egrave;re"
> >> [3] "Lumi&egrave;re"           "Ni&eacute;pce"
> >> [5] "S&uuml;ssmilch"           "Sch&uuml;pbach"
> >>   > grep("&", author$birthplace, value=TRUE)
> >> [1] "Marbach, W&uuml;rttemberg"
> >> [2] "C&ocirc;te-d&#039;Or"
> >> [3] "Chalon-sur-Sa&ocirc;ne, Sa&ocirc;ne-et-Loire"
> >> [4] "Gro&szlig; S&auml;rchen, Germany"
> >>   > apropos("HTML")
> >>
> >> thx,
> >> -Michael
> >>
> >> --
> >> Michael Friendly     Email: friendly AT yorku DOT ca
> >> Professor, Psychology Dept.
> >> York University      Voice: 416 736-2100 x66249 Fax: 416 736-5814
> >> 4700 Keele Street    Web:   http://www.datavis.ca
> >> Toronto, ONT  M3J 1P3 CANADA
> >>
> >> ______________________________________________
> >> R-help@r-project.org mailing list
> >> https://stat.ethz.ch/mailman/listinfo/r-help
> >> PLEASE do read the posting guide http://www.R-project.org/posting-
> >> guide.html
> >> and provide commented, minimal, self-contained, reproducible code.
> 
> 
> --
> Michael Friendly     Email: friendly AT yorku DOT ca
> Professor, Psychology Dept.
> York University      Voice: 416 736-2100 x66249 Fax: 416 736-5814
> 4700 Keele Street    Web:   http://www.datavis.ca
> Toronto, ONT  M3J 1P3 CANADA

______________________________________________
R-help@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.

Reply via email to