Accented characters in itemset.csv

chrissyhroberts · September 13, 2018, 4:08pm

I am using ODK XLSForm Offline v 1.7.0 on OS X 10.13.6

My form design includes use of select_one_external so an itemset.csv file is created

The various labels in the itemset include accented characters such as é

When I convert the form, the accented characters become transliterated

for instance

Équateur becomes √âquateur

Clearly this is an issue with UTF-8 etc, but not sure if there is an end user fix for this.
When using a cascading select within the normal 'choices' sheet, I don't have this problem, so I think an isolated problem with encoding the itemset.csv file.

Please advise on whether this is possible to fix.

Thanks

What ODK tool and version are you using? And on what device and operating system version?

What steps can we take to reproduce the problem?

What you have you tried to fix the problem?

Anything else we should know or have? If you have a test form or screenshots or logs, attach here.

aurdipas · September 13, 2018, 4:22pm

I faced this kind of issues at one point.
What I did to fix it: I created the itemset file with LibreOffice and I saved as csv.
When you do it with Libre Office what you get is a prompt asking to choose the Character set and there I checked UTF-8.
Let me know if this solves your problem.

Trung · September 13, 2018, 4:32pm

Another trick would be exporting csv from Google Sheet.

chrissyhroberts · September 14, 2018, 11:40am

Thanks for the simple fix ideas. I followed this advice and used another system to make the file.

For others this script may be useful. It uses R to process country/region/district/subdistrict data from public data sets and makes an itemsets.csv file from them.

Chrissy h

###############################################
# Using Administrative data from DIVA-GIS,
# this script will convert the geographical
# administrative data (levels 0-3)
# for one or more country
# and output an itemsets.csv file
# that works with ODK and preserves accented
# characters (tested only on roman characters)
# Also removes accents from internal variables
# like 'name'. Accents will appear on screen but
# won't be preserved in data frames.
################################################
#MIT LICENCE Copyright © 2018 <chrissy.roberts@lshtm.ac.uk>
################################################
# INSTRUCTIONS
# First get administrative data from http://www.diva-gis.org/datadown
# for each of the countries you want to include
# unzip to a folder in the same directory as this script
# this example uses Uganda and Democratic Republic of Congo.


# Specify ISO codes (3 digit) to tell R which data sets to include
countries<-c("UGA","COD")

#for each country, find the data set in the folder structure and load up then bind together
for(i in 1:length(countries))
{
level3data<-read.csv(list.files(pattern = paste(countries[i],"_adm3.csv",sep=""),full.names = T,recursive = T))
level3data<-level3data[,c("NAME_0","NAME_1","NAME_2","NAME_3","ISO")]
if(i==1){full.data.output<-level3data}
if(i!=1){full.data.output<-rbind(full.data.output,level3data)}
}

#set correct names so that label is descriptive country name
names(full.data.output)[which(names(full.data.output)=="NAME_0")]<-"label"
#set correct names so that ISO code is NAME_0 (level 0 country data)
names(full.data.output)[which(names(full.data.output)=="ISO")]<-"NAME_0"

#get rid of whitespace and dots and so on [might need to add more]
full.data.output<-full.data.output[,c("NAME_0","NAME_1","NAME_2","NAME_3","label")]
full.data.output$NAME_0<-gsub(full.data.output$NAME_0,pattern = "/| |'|//.|",replacement = "_")
full.data.output$NAME_1<-gsub(full.data.output$NAME_1,pattern = "/| |'|//.",replacement = "_")
full.data.output$NAME_2<-gsub(full.data.output$NAME_2,pattern = "/| |'|//.",replacement = "_")
full.data.output$NAME_3<-gsub(full.data.output$NAME_3,pattern = "/| |'|//.",replacement = "_")

#find level zero and blank out levels 1,2,3
level0<-full.data.output
level0[,2:4]<-""
level0<-unique(level0)
level0$list_name<-"NAME_0"


#find level one and blank out levels 2,3
level1<-full.data.output[,c(2,1,3,4,5)]
level1[,3:4]<-""
level1<-unique(level1)
level1$list_name<-"NAME_1"
level1$label<-level1$NAME_1

#find level two and blank out level 3
level2<-full.data.output[,c(3,1,2,4,5)]
level2[,4]<-""
level2<-unique(level2)
level2$list_name<-"NAME_2"
level2$label<-level2$NAME_2

#find level three
level3<-full.data.output[,c(4,1,2,3,5)]
level3<-unique(level3)
level3$list_name<-"NAME_3"
level3$label<-level3$NAME_3

#put it all together
output<-level0
output<-rbind(output,level1)
output<-rbind(output,level2)
output<-rbind(output,level3)

#define function to remove accents from text
removeAccents<-function(x)
{
a <- c('À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ÿ', 'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'Ń', 'ń', 'Ņ', 'ņ', 'Ň', 'ň', 'ŉ', 'Ō', 'ō', 'Ŏ', 'ŏ', 'Ő', 'ő', 'Œ', 'œ', 'Ŕ', 'ŕ', 'Ŗ', 'ŗ', 'Ř', 'ř', 'Ś', 'ś', 'Ŝ', 'ŝ', 'Ş', 'ş', 'Š', 'š', 'Ţ', 'ţ', 'Ť', 'ť', 'Ŧ', 'ŧ', 'Ũ', 'ũ', 'Ū', 'ū', 'Ŭ', 'ŭ', 'Ů', 'ů', 'Ű', 'ű', 'Ų', 'ų', 'Ŵ', 'ŵ', 'Ŷ', 'ŷ', 'Ÿ', 'Ź', 'ź', 'Ż', 'ż', 'Ž', 'ž', 'ſ', 'ƒ', 'Ơ', 'ơ', 'Ư', 'ư', 'Ǎ', 'ǎ', 'Ǐ', 'ǐ', 'Ǒ', 'ǒ', 'Ǔ', 'ǔ', 'Ǖ', 'ǖ', 'Ǘ', 'ǘ', 'Ǚ', 'ǚ', 'Ǜ', 'ǜ', 'Ǻ', 'ǻ', 'Ǽ', 'ǽ', 'Ǿ', 'ǿ');
b <- c('A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'O', 'U', 'U', 'U', 'U', 'Y', 's', 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 'n', 'o', 'o', 'o', 'o', 'o', 'o', 'u', 'u', 'u', 'u', 'y', 'y', 'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'IJ', 'ij', 'J', 'j', 'K', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'l', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', 'O', 'o', 'O', 'o', 'O', 'o', 'OE', 'oe', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's', 'f', 'O', 'o', 'U', 'u', 'A', 'a', 'I', 'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'A', 'a', 'AE', 'ae', 'O', 'o');
for(i in 1:length(a))
{
x<-gsub(x = x,pattern = a[i],replacement = b[i])
}
return(x)
}

# remove accents from name, levels 0-3, leaving them only in label.
output<-output[,c("list_name","NAME_0","NAME_1","NAME_2","NAME_3","label")]
output$name<-output$label
output$name<-removeAccents(output$name)
output$NAME_0<-removeAccents(output$NAME_0)
output$NAME_1<-removeAccents(output$NAME_1)
output$NAME_2<-removeAccents(output$NAME_2)
output$NAME_3<-removeAccents(output$NAME_3)

head(output)

#write output to itemsets.csv
write.csv(x = output,file = "itemsets.csv",quote = F,row.names = F)

chrissyhroberts · September 14, 2018, 1:59pm

slightly modified version of script above.
This one adds an 'other' option to every relevant place

###############################################
# Using Administrative dara from DIVA-GIS,
# this script will convert the geographical
# administrative data (levels 0-3)
# for one or more country
# and output an itemsets.csv file
# that works with ODK and preserves accented
# characters (tested only on roman characters)
# Also removes accents from internal variables
# like 'name'. Accents will appear on screen but
# won't be preserved in data frames.
################################################
#MIT LICENCE Copyright © 2018 <chrissy.roberts@lshtm.ac.uk>
################################################
# INSTRUCTIONS
# First get administrative data from http://www.diva-gis.org/datadown
# for each of the countries you want to include
# unzip to a folder in the same directory as this script
# this example uses Uganda and Democratic Republic of Congo.


# Specify ISO codes (3 digit) to tell R which data sets to include
countries<-c("UGA","COD")

#for each country, find the data set in the folder structure and load up then bind together
for(i in 1:length(countries))
{
level3data<-read.csv(list.files(pattern = paste(countries[i],"_adm3.csv",sep=""),full.names = T,recursive = T))
level3data<-level3data[,c("NAME_0","NAME_1","NAME_2","NAME_3","ISO")]
if(i==1){full.data.output<-level3data}
if(i!=1){full.data.output<-rbind(full.data.output,level3data)}
}

#set correct names so that label is descriptive country name
names(full.data.output)[which(names(full.data.output)=="NAME_0")]<-"label"
#set correct names so that ISO code is NAME_0 (level 0 country data)
names(full.data.output)[which(names(full.data.output)=="ISO")]<-"NAME_0"

#get rid of whitespace and dots and so on [might need to add more]
full.data.output<-full.data.output[,c("NAME_0","NAME_1","NAME_2","NAME_3","label")]
full.data.output$NAME_1<-gsub(full.data.output$NAME_1,pattern = "/| |'|//.",replacement = "_")
full.data.output$NAME_2<-gsub(full.data.output$NAME_2,pattern = "/| |'|//.",replacement = "_")
full.data.output$NAME_3<-gsub(full.data.output$NAME_3,pattern = "/| |'|//.",replacement = "_")

#find level zero and blank out levels 1,2,3
level0<-full.data.output
level0[,2:4]<-""
level0<-unique(level0)
level0$list_name<-"NAME_0"


#find level one and blank out levels 2,3
level1<-full.data.output[,c(2,1,3,4,5)]
level1[,3:4]<-""
level1<-unique(level1)
level1$list_name<-"NAME_1"
level1$label<-level1$NAME_1
x<-level1
x$NAME_1<-""
x$label<-""
x<-unique(x)
x$NAME_1<-"Other"
x$label<-"Other"
level1<-rbind(level1,x)
rm(x)


#find level two and blank out level 3
level2<-full.data.output[,c(3,1,2,4,5)]
level2[,4]<-""
level2<-unique(level2)
level2$list_name<-"NAME_2"
level2$label<-level2$NAME_2
x<-level2
x$NAME_2<-""
x$label<-""
x<-unique(x)
x$NAME_2<-"Other"
x$label<-"Other"
level2<-rbind(level2,x)
rm(x)

#find level three
level3<-full.data.output[,c(4,1,2,3,5)]
level3<-unique(level3)
level3$list_name<-"NAME_3"
level3$label<-level3$NAME_3
x<-level3
x$NAME_3<-""
x$label<-""
x<-unique(x)
x$NAME_3<-"Other"
x$label<-"Other"
level3<-rbind(level3,x)
rm(x)


#put it all together
output<-level0
output<-rbind(output,level1)
output<-rbind(output,level2)
output<-rbind(output,level3)

#define function to remove accents from text
removeAccents<-function(x)
{
a <- c('À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ÿ', 'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'Ń', 'ń', 'Ņ', 'ņ', 'Ň', 'ň', 'ŉ', 'Ō', 'ō', 'Ŏ', 'ŏ', 'Ő', 'ő', 'Œ', 'œ', 'Ŕ', 'ŕ', 'Ŗ', 'ŗ', 'Ř', 'ř', 'Ś', 'ś', 'Ŝ', 'ŝ', 'Ş', 'ş', 'Š', 'š', 'Ţ', 'ţ', 'Ť', 'ť', 'Ŧ', 'ŧ', 'Ũ', 'ũ', 'Ū', 'ū', 'Ŭ', 'ŭ', 'Ů', 'ů', 'Ű', 'ű', 'Ų', 'ų', 'Ŵ', 'ŵ', 'Ŷ', 'ŷ', 'Ÿ', 'Ź', 'ź', 'Ż', 'ż', 'Ž', 'ž', 'ſ', 'ƒ', 'Ơ', 'ơ', 'Ư', 'ư', 'Ǎ', 'ǎ', 'Ǐ', 'ǐ', 'Ǒ', 'ǒ', 'Ǔ', 'ǔ', 'Ǖ', 'ǖ', 'Ǘ', 'ǘ', 'Ǚ', 'ǚ', 'Ǜ', 'ǜ', 'Ǻ', 'ǻ', 'Ǽ', 'ǽ', 'Ǿ', 'ǿ');
b <- c('A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'O', 'U', 'U', 'U', 'U', 'Y', 's', 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 'n', 'o', 'o', 'o', 'o', 'o', 'o', 'u', 'u', 'u', 'u', 'y', 'y', 'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'IJ', 'ij', 'J', 'j', 'K', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'l', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', 'O', 'o', 'O', 'o', 'O', 'o', 'OE', 'oe', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's', 'f', 'O', 'o', 'U', 'u', 'A', 'a', 'I', 'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'A', 'a', 'AE', 'ae', 'O', 'o');
for(i in 1:length(a))
{
x<-gsub(x = x,pattern = a[i],replacement = b[i])
}
return(x)
}

# remove accents from name, levels 0-3, leaving them only in label.
output<-output[,c("list_name","NAME_0","NAME_1","NAME_2","NAME_3","label")]
output$name<-output$label
output$name<-removeAccents(output$name)
output$NAME_0<-removeAccents(output$NAME_0)
output$NAME_1<-removeAccents(output$NAME_1)
output$NAME_2<-removeAccents(output$NAME_2)
output$NAME_3<-removeAccents(output$NAME_3)

head(output)

#write output to itemsets.csv
write.csv(x = output,file = "itemsets.csv",quote = F,row.names = F)