On Oct 9, 2015, at 12:59 PM, Omar André Gonzáles Díaz wrote: > I need to extract an ID from the product column of my df. > > I was able to extract the ids for some scenearios, but when applying my > code for the next type of ids (there are some other combinations), the > results of my first line of code got NAs. > > > ripley.tv$id <- sub("(.*)( [0-9]{2}[a-z]{1}[0-9]{4})(.*)", "\\2", > ripley.tv$producto, > ignore.case = T)
I think you need to examine the implicit logic of your pattern argument against the realities of your data: > as.matrix(head(ripley.tv$producto, 10)) [,1] [1,] "SMART TV LED FHD 48\" 3D\n48J6400" [2,] "SMART TV LED FHD 40\" 40J5300" [3,] "TV LED FULL HD 40'' TC-40CS600L" [4,] "TELEVISOR LED LE28F6600 28\"" [5,] "SMART TV 40\" HD LE40K5000N" [6,] "TV LED HD 32'' LE32B7000" [7,] "SMART TV 32'' LE32K5000N" [8,] "TV LED FHD 55\" -\nLE55B8000" [9,] "TV LED LE40B8000 FULL HD 40\"" [10,] "TV LE24B8000 LED HD 24\" - NEGRO" So the first value and most of the other values do not have a space before the segment of characters that you want to match, The third item has two character inside the flanking numbers. I tried fixing that by removing the space and putting in optional length values and discovered that some of you id's have only 3 trailing digit-characters: head( sub("(.*)([0-9]{2}[a-z]{1,2}[0-9]{4})(.*)", "\\2", ripley.tv$producto, ignore.case = T), 20 ) #----------- [1] "48J6400" "40J5300" [3] "TV LED FULL HD 40'' TC-40CS600L" "28F6600" [5] "40K5000" "32B7000" [7] "32K5000" "55B8000" [9] "40B8000" "24B8000" [11] "TV LED FULL HD 42'' TC-42AS610" "50K5000" [13] "40JU6500" "48JU6500" [15] "50JU6500" "55JS9000" [17] "55JU6500" "55JU6700" [19] "55JU7500" "65JS9000" head( sub("(.*)([0-9]{2}[a-z]{1,2}[0-9]{3,4})(.*)", "\\2", ripley.tv$producto, ignore.case = T), 20 ) #-------------- [1] "48J6400" "40J5300" "40CS600" "28F6600" "40K5000" "32B7000" [7] "32K5000" "55B8000" "40B8000" "24B8000" "42AS610" "50K5000" [13] "40JU6500" "48JU6500" "50JU6500" "55JS9000" "55JU6500" "55JU6700" [19] "55JU7500" "65JS9000" > > ripley.tv$id <- sub("(.*)( [0-9]{2}[a-z]{2}[0-9]{4} )(.*)", "\\2", > ripley.tv$producto, > ignore.case = T) > > ripley.tv$id <- sub("(.*)( [a-z]{2}[0-9]{2}[a-z]{1}[0-9]{4} )(.*)", "\\2", > ripley.tv$producto, ignore.case = T) > > > Also I've tried to use the ifelse function, but got "/2" as result. I think you will find that `ifelse` delivers strange results when used with factors as values. It strips attributes. -- david. > > > ripley.tv <- ripley.tv %>% > mutate(id = NA, > id = ifelse(grepl("(.*)([0-9]{2}[A-Z]{1}[0-9]{4})(.*)", > ripley.tv$producto) == T, "\\2",id), > id = ifelse(grepl("(.*)([0-9]{2}[A-Z]{2}[0-9]{4})(.*)", > ripley.tv$producto) == T, "\\2",id), > id = > ifelse(grepl("(.*)([A-Z]{2}[0-9]{2}[A-Z]{1}[0-9]{4})(.*)", ripley.tv$producto) > == T, "\\2",id)) > > > > And also str_extract: > > > ripley.tv$id <- str_extract(ripley.tv$producto, "[0-9]{2}[A-Z]{1}[0-9]{4}") > ripley.tv$id <- str_extract(ripley.tv$producto, "[0-9]{2}[A-Z]{2}[0-9]{4}") > ripley.tv$id <- str_extract(ripley.tv$producto, > "[A-Z]{2}[0-9]{2}[A-Z]{1}[0-9]{4}") > > This is my data: > > > ripley.tv <- structure(list(id = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, > NA, NA, NA, NA, NA, NA, NA), marca = c("SAMSUNG", "SAMSUNG", > "PANASONIC", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", "HAIER", > "HAIER", "PANASONIC", "HAIER", "SAMSUNG", "SAMSUNG", "SAMSUNG", > "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", > "SAMSUNG", "SAMSUNG", "LG", "LG", "LG", "LG", "LG", "LG", "LG", > "LG", "LG", "LG", "LG", "LG", "LG", "PANASONIC", "PANASONIC", > "LG", "LG", "LG", "SONY", "SONY", "SONY", "SAMSUNG", "SAMSUNG", > "SAMSUNG", "SAMSUNG", "SAMSUNG", "SAMSUNG", "SONY", "SONY", "SAMSUNG", > "SAMSUNG", "LG", "LG", "LG", "SONY", "SAMSUNG", "AOC", "PANASONIC", > "SONY", "LG", "AOC", "LG", "SONY", "SONY", "SONY", "SONY", "SONY", > "SONY", "SHARP", "SHARP", "SHARP", "SAMSUNG", "LG", "LG", "LG", > "LG", "SAMSUNG", "SONY", "LG", "LG", "LG", "LG", "LG", "PANASONIC", > "SONY", "SHARP", "SONY", "PANASONIC", "SONY", "SAMSUNG", "AOC", > "PANASONIC", "SHARP", "AOC"), producto = c("SMART TV LED FHD 48\" 3D > 48J6400", > "SMART TV LED FHD 40\" 40J5300", "TV LED FULL HD 40'' TC-40CS600L", > "TELEVISOR LED LE28F6600 28\"", "SMART TV 40\" HD LE40K5000N", > "TV LED HD 32'' LE32B7000", "SMART TV 32'' LE32K5000N", "TV LED FHD 55\" - > LE55B8000", > "TV LED LE40B8000 FULL HD 40\"", "TV LE24B8000 LED HD 24\" - NEGRO", > "TV LED FULL HD 42'' TC-42AS610", "TELEVISOR LED LE50K5000N 50\"", > "SMART TV LED UHD 40\" 40JU6500", "SMART TV ULTRA HD 48'' 48JU6500", > "SMART TV 50JU6500 LED UHD 50\" - NEGRO", "SMART TV ULTRA HD 55'' 3D > 55JS9000", > "SMART TV LED UHD 55\" 55JU6500", "SMART TV ULTRA HD 55'' 55JU6700", > "SMART TV CURVO 55JU7500 LED UHD 55\" 3D - NEGRO", "SMART TV ULTRA HD 65'' > 3D 65JS9000", > "SMART TV 65JU6500 LED UHD 65\"", "SMART TV ULTRA HD 65'' 65JU7500", > "SMART TV LED UHD 75\" 75JU6500", "SMART TV WEB OS 40\" FULL HD 40LF6350", > "SMART TV 3D 42\" FULL HD 42LF6400", "TV LED 42\" FULL HD CINEMA 3D > 42LF6450", > "TV LED 49\" FULL HD CINEMA 3D 49LF6450", "SMART TV LF6400 49\" FULL HD > 3D", > "TV 43UF6750 43\" ULTRA HD 4K", "TV 49\" ULTRA HD 4K 49UF6750", > "TV LED 49\" ULTRA HD SMART UF6900", "SMART TV 49UF7700 49\" ULTRA HD 4K", > "SMART TV 49UF8500 49\" ULTRA HD 4K 3D", "TV LED 55\" CINEMA 3D SMART TV > 55UF7700", > "SMART TV 65UF7700 65\" ULTRA HD 4K", "SMART TV 55UF8500 55\" ULTRA HD 4K > 3D", > "TV LED 55\" ULTRA HD 4K SMART TC-55CX640W", "TV LED 50\" ULTRA HD 4K SMART > TC-50CX640W", > "SMART TV 70UF7700 3D ULTRA HD 70\"", "TV LED CURVO 65\" ULTRA HD 4K CINEMA > SMART UG8700", > "TV LED 60\" FULL HD SMART LF6350", "SMART TV KDL-50FA95C 50\" FULL HD 3D", > "SMART TV KDL50W805C 50\" FULL HD 3D", "TV LED 40\" FULL HD KDL-40R354B", > "SMART TV LED FULL HD 40'' 40J5500", "SMART TV LED FULL HD 50'' 50J5500", > "TV LED HD 32'' 32JH4005", "SMART TV LED FULL HD 50\" 50J5300", > "SMART TV LED 48\" FULL HD 48J5300", "SMART TV FULL HD 40'' 3D 40J6400", > "TV LED 32\" HD SMART KDL-32R505C", "TV LED 40\" SMART FULL HD KDL-40R555C > - NEGRO", > "SMART TV LED FHD 55\" 3D 55J6400", "TV 40JH5005 LED FHD 40\" - NEGRO", > "TV 43\" FULL HD 43LF5410", "SMART TV 32LF585B LED HD 32\" - BLANCO", > "TV LED 49\" FULL HD SMART 49LF5900", "SMART TV 65\" FULL HD 3D > KDL-65W855C", > "SMART TV LED FHD 48\" UN48J6500", "TV LED 40\" FULL HD LE40F1551", > "TV LED 32'' SMART HD TC-32AS600L", "TV LED 32'' HD KDL-32R304B", > "TV OLED 55\" SMART 3D FULL HD 55EC9300 PLATEADO", "TV LED HD 32'' > LE32W454F", > "TV LED 58\" ULTRA HD SMART 58UF8300", "TV LED 55\" FULL HD SMART 3D > KDL-55W805C", > "TV LED 49\" ULTRA HD 4K XBR-49X835C", "TV LED 55\" ULTRA HD 4K > XBR-55X855C", > "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-65X905C", "TV LED 75\" ULTRA HD > 4K 3D XBR-75X945C", > "TV LED ULTRA DELGADO 55\" ULTRA HD 4K XBR-55X905C", "SMART TV LED 60'' > ULTRA HD 4K LC60UE30U", > "SMART TV LED 70'' ULTRA HD 4K LC70UE30U", "SMART TV LED 80'' ULTRA HD 4K > LC80UE30U", > "SMART TV LED FULL HD 48'' 48J5500", "SMART TV CURVO 79UG8800 79\" ULTRA HD > 4K 3D", > "SMART TV 65UF9500 65\" ULTRA HD 4K 3D", "SMART TV 65UF8500 65\" ULTRA HD > 4K 3D", > "SMART TV 55UF9500 55\" ULTRA HD 4K 3D", "SMART TV LED HD 32\" 32J4300", > "TV LED 48\" SMART FULL HD KDL-48R555C - NEGRO", "SMART TV 55UG8700 55\" > ULTRA HD 4K 3D", > "SMART TV 60UF8500 60\" ULTRA HD 4K 3D", "SMART TV 55LF6500 55\" FULL HD > 3D", > "TV 32LF550B 32\" HD", "TV LED 47\" FULL HD 47LB5610", "TV LED FULL HD 50'' > TC-50AS600L", > "TV SMART LED 55\" UHD 3D XBR-55X855B", "TV LED FULL HD 4K LC70SQ17U 70''", > "TV LED SMART UHD 79\" XBR-79X905B", "TV LED FULL HD 40'' TC-40A400L", > "TV LED SMART UHD 70\" XBR-70X855B", "SMART TV UHD 55'' 3D CURVO 55HU8700", > "TV FULL HD LE40D3142 40\" - NEGRO", "TELEVISOR LED 42\" TC-42AS650L", > "SMART TV LCD FHD 70\" LC70LE660", "TV LED FULL HD 58'' LE58D3140" > ), pulgadas = c(48L, 40L, 40L, 28L, 40L, 32L, 32L, 55L, 40L, > 24L, 42L, 50L, 40L, 48L, 50L, 55L, 55L, 55L, 55L, 65L, 65L, 65L, > 75L, 40L, 42L, 42L, 49L, 49L, 43L, 49L, 49L, 49L, 49L, 55L, 65L, > 55L, 55L, 50L, 70L, 65L, 60L, 50L, 50L, 40L, 40L, 50L, 32L, 50L, > 48L, 40L, 32L, 40L, 55L, 40L, 43L, 32L, 49L, 65L, 48L, 40L, 32L, > 32L, 55L, 32L, 58L, 55L, 49L, 55L, 55L, 75L, 55L, 60L, 70L, 80L, > 48L, 79L, 65L, 65L, 55L, 32L, 48L, 55L, 60L, 55L, 32L, 47L, 50L, > 55L, 70L, 79L, 40L, 70L, 55L, 40L, 42L, 70L, 58L), precio.antes = c(2799L, > 1799L, 1699L, 599L, 1299L, 699L, 999L, 1999L, 999L, 499L, 1899L, > 1799L, 2499L, 3999L, 3699L, 10999L, 4299L, 5499L, 6999L, 14999L, > 8999L, 9999L, 14599L, 1999L, 2299L, 2299L, 2899L, 2999L, 2299L, > 23992L, 3599L, 3799L, 4799L, 4999L, 8499L, 5999L, 4999L, 3999L, > 11999L, 10999L, 4399L, 4499L, 3799L, 1399L, 2299L, 2799L, 999L, > 2199L, 2299L, 2299L, 1299L, 1699L, 3499L, 1399L, 1549L, 1299L, > 2399L, 6499L, 2999L, 999L, 1249L, 999L, 14999L, 799L, 5999L, > 4499L, 4999L, 6499L, 12999L, 24999L, 8999L, 5999L, 7599L, 14999L, > 2499L, 29999L, 13999L, 9999L, 9699L, 1299L, 2399L, 6999L, 7999L, > 3699L, 999L, 1899L, 2999L, 7999L, 8499L, 24999L, 1399L, 13999L, > 8499L, 999L, 2599L, 5799L, 2399L), precio.nuevo = c(2299, 1399, > 1299, 549, 1099, 629, 799, 1699, 849, 439, 1499, 1549, 1759.2, > 2099.3, 2309.3, 7699.3, 2799.3, 3639.3, 4899.3, 10499.3, 5109.3, > 6999.3, 10219.3, 1399, 1599, 1599, 2199, 2199, 1299, 23992, 2299, > 2299, 2899, 2999, 5999, 3899, 4999, 3999, 8999, 6999, 4099, 3999, > 3499, 1299, 1799, 2399, 799, 2199, 1799, 1999, 1199, 1599, 2999, > 1199, 1399, 1099, 1999, 5999, 2799, 999, 1199, 949, 7999, 799, > 5299, 4299, 3999, 5999, 11999, 23999, 7999, 5699, 7599, 14499, > 2399, 29999, 11999, 8999, 7499, 1099, 2199, 6599, 7099, 3599, > 899, 1599, 2199, 4999, 6499, 19999, 1399, 9999, 5999, 999, 2599, > 5699, 2399), dif.precios = c(500, 400, 400, 50, 200, 70, 200, > 300, 150, 60, 400, 250, 739.8, 1899.7, 1389.7, 3299.7, 1499.7, > 1859.7, 2099.7, 4499.7, 3889.7, 2999.7, 4379.7, 600, 700, 700, > 700, 800, 1000, 0, 1300, 1500, 1900, 2000, 2500, 2100, 0, 0, > 3000, 4000, 300, 500, 300, 100, 500, 400, 200, 0, 500, 300, 100, > 100, 500, 200, 150, 200, 400, 500, 200, 0, 50, 50, 7000, 0, 700, > 200, 1000, 500, 1000, 1000, 1000, 300, 0, 500, 100, 0, 2000, > 1000, 2200, 200, 200, 400, 900, 100, 100, 300, 800, 3000, 2000, > 5000, 0, 4000, 2500, 0, 0, 100, 0), dif.porcentual = c(17.86, > 22.23, 23.54, 8.35, 15.4, 10.01, 20.02, 15.01, 15.02, 12.02, > 21.06, 13.9, 29.6, 47.5, 37.57, 30, 34.88, 33.82, 30, 30, 43.22, > 30, 30, 30.02, 30.45, 30.45, 24.15, 26.68, 43.5, 0, 36.12, 39.48, > 39.59, 40.01, 29.42, 35.01, 0, 0, 25, 36.37, 6.82, 11.11, 7.9, > 7.15, 21.75, 14.29, 20.02, 0, 21.75, 13.05, 7.7, 5.89, 14.29, > 14.3, 9.68, 15.4, 16.67, 7.69, 6.67, 0, 4, 5.01, 46.67, 0, 11.67, > 4.45, 20, 7.69, 7.69, 4, 11.11, 5, 0, 3.33, 4, 0, 14.29, 10, > 22.68, 15.4, 8.34, 5.72, 11.25, 2.7, 10.01, 15.8, 26.68, 37.5, > 23.53, 20, 0, 28.57, 29.42, 0, 0, 1.72, 0), rangos = c("S/.1500 - S/.2500", > "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.500 - > S/.1500", > "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > "S/.500 - S/.1500", "< S/.500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", > "> S/.4,500", "S/.2500 - S/.3500", "S/.3500 - S/.4500", "> S/.4,500", > "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "S/.500 - S/.1500", > "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.1500 - S/.2500", > "S/.1500 - S/.2500", "S/.500 - S/.1500", "> S/.4,500", "S/.1500 - S/.2500", > "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.2500 - S/.3500", > "> S/.4,500", "S/.3500 - S/.4500", "> S/.4,500", "S/.3500 - S/.4500", > "> S/.4,500", "> S/.4,500", "S/.3500 - S/.4500", "S/.3500 - S/.4500", > "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > "S/.1500 - S/.2500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > "S/.1500 - S/.2500", "S/.1500 - S/.2500", "S/.500 - S/.1500", > "S/.1500 - S/.2500", "S/.2500 - S/.3500", "S/.500 - S/.1500", > "S/.500 - S/.1500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > "> S/.4,500", "S/.2500 - S/.3500", "S/.500 - S/.1500", "S/.500 - S/.1500", > "S/.500 - S/.1500", "> S/.4,500", "S/.500 - S/.1500", "> S/.4,500", > "S/.3500 - S/.4500", "S/.3500 - S/.4500", "> S/.4,500", "> S/.4,500", > "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > "> S/.4,500", "S/.500 - S/.1500", "S/.1500 - S/.2500", "> S/.4,500", > "> S/.4,500", "S/.3500 - S/.4500", "S/.500 - S/.1500", "S/.1500 - S/.2500", > "S/.1500 - S/.2500", "> S/.4,500", "> S/.4,500", "> S/.4,500", > "S/.500 - S/.1500", "> S/.4,500", "> S/.4,500", "S/.500 - S/.1500", > "S/.2500 - S/.3500", "> S/.4,500", "S/.1500 - S/.2500")), .Names = c("id", > "marca", "producto", "pulgadas", "precio.antes", "precio.nuevo", > "dif.precios", "dif.porcentual", "rangos"), class = "data.frame", row.names > = c(NA, > -97L)) > > [[alternative HTML version deleted]] > > ______________________________________________ > R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see > https://stat.ethz.ch/mailman/listinfo/r-help > PLEASE do read the posting guide http://www.R-project.org/posting-guide.html > and provide commented, minimal, self-contained, reproducible code. David Winsemius Alameda, CA, USA ______________________________________________ R-help@r-project.org mailing list -- To UNSUBSCRIBE and more, see https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.