Upload files in R

Upload files from Excel, STATA, SAS, SPSS and text

First set the working directory (or check it)

getwd() # get working directory (prints the path R currently reads/writes from)
 [1] "/Users/me/My Folder/"
 setwd("./My Subfolder/") # set working directory (path is relative to the current one)

1. .csv and .txt files

the read.csv function has many options, some of them are header=TRUE which sets the first line as column names, sep="," the field separator character (in this case the comma), dec="." the decimal separator character, skip=2 the number of lines to skip before reading data (in this case 2).

read.csv2 is identical to read.csv except it assumes commas to be the decimal operators and semicolon as field separator

read.table works similarly to read.csv, but reads text files.

 mydata <- read.csv("mydata.csv", header=T)

When importing data in R, if any column’s name is a number, R will add an X to it (as in general it is a very bad idea to have numbers for column names, but can be handy). You can replace column names with:

 colnames(mydata) <- c("name1", "name2", "name3", "2017", "2018", "2019")

If you change or add anything to your data and want to save it then ( write.table for txt output):

 write.csv(mydata, "mydata.csv", row.names=FALSE)

2. STATA files .dta

 write.dta(mydata, "mydata.dta")

3. SPSS files .sav

use.value.labels by default is TRUE and converts value labels into factors. The mydata.txt is the name for data output, while the mydata.sps is the code output.

 # Read an SPSS file into a data frame; use.value.labels=FALSE keeps the raw coded
 # values instead of converting value labels to factors (the default is TRUE)
 mydata <- read.spss("mydata", to.data.frame=T, use.value.labels = FALSE)
 # Export for SPSS: mydata.txt holds the data, mydata.sps the SPSS syntax to read it
 write.foreign(as.data.frame(mydata), "mydata.txt", "mydata.sps", package="SPSS")

4. SAS files .xpt (SAS transport format)

Note that by default it converts value labels into factors

## to read from SAS: sasxport.get (Hmisc package) reads SAS XPORT (.xpt) transport files
 mydata <- sasxport.get("mydata.xpt")

## to save in SAS format: mydata.txt holds the data, mydata.sas the SAS code to read it
 write.foreign(as.data.frame(mydata), "mydata.txt", "mydata.sas", package="SAS")

5. Excel spreadsheet

# library(xlsx)
 mydata <- read.xlsx("c:/myexcel.xlsx", 1) # 1 refers to the first worksheet-page; alternatively...
 mydata <- read.xlsx("c:/myexcel.xlsx", sheetName="Data input") # ...select the sheet by its name
 write.xlsx(mydata, "mydata.xlsx")
# library(readxl)
# NOTE(review): system.file() only resolves files bundled inside an installed package;
# for your own spreadsheet pass the path directly to read_excel() — confirm intent here.
mydata <-system.file("mypath/myexcel.xlsx", package = "readxl")
mydata <- read_excel(mydata, 1)

(A few) quick tricks

# head(mydata, n=10) # first 10 rows
 tail(mydata, n=10) # last 10 rows
 mydata[1,1:10] # print first row and first 10 columns
 names(mydata) # variable names
 nrow(mydata) # number of rows
 ncol(mydata) # number of columns
 str(mydata) # list structure of data
 class(mydata) # class of data
 View(mydata) # opens viewer window (note the capital V: view() is not a base R function)

A map of the US election results

  1. Upload libraries:
# NOTE(review): rm(list = ls()) wipes the entire user workspace — fine interactively,
# but avoid it in scripts that others may source into an existing session.
rm(list = ls(all=T)) #clear workspace
library(maptools) #to read shapefiles

2. Download the data files (note they are not ready for use but need some cleaning as there are more areas in the excel files than polygons in the shape file). I copy here the code as I have used it in my script but it’s available at RPubs thanks to David Robinson.

# Download the census spreadsheets into the working directory
# (LND01.xls = land area, POP01.xls = resident population)
download.file("http://www2.census.gov/prod2/statcomp/usac/excel/LND01.xls", "LND01.xls")
download.file("http://www2.census.gov/prod2/statcomp/usac/excel/POP01.xls", "POP01.xls")

according to metadata, this is Land Area in 2010 and resident population in 2010:

# Land area (LND110210D) and resident population (POP010210D) for 2010, per the metadata.
# NOTE: the original snippet had lost its %>% pipes, so transmute() was called with
# no data argument; they are restored here (requires readxl and dplyr).
us_county_area <- read_excel("LND01.xls") %>%
  transmute(CountyCode = as.character(as.integer(STCOU)), Area = LND110210D)

us_county_population <- read_excel("POP01.xls") %>%
  transmute(CountyCode = as.character(as.integer(STCOU)), Population = POP010210D)

3. Adjust data

election_url <- "https://raw.githubusercontent.com/Prooffreader/election_2016_data/master/data/presidential_general_election_2016_by_county.csv"

# Clean the county-level election results into one row per county with one column
# per candidate. NOTE: the original snippet had lost its %>% pipes and never closed
# the transmute() call before spread(); both are restored here. ungroup() is added
# so spread() receives ungrouped data.
county_data <- read_csv(election_url) %>%
  group_by(CountyCode = as.character(fips)) %>%
  mutate(TotalVotes = sum(votes)) %>%          # total votes cast in each county
  ungroup() %>%
  mutate(name = str_replace(name, ".\\. ", "")) %>%  # drop middle initials ("Hillary R. Clinton" -> surname form)
  filter(name %in% c("Trump", "Clinton", "Johnson", "Stein")) %>%
  transmute(County = str_replace(geo_name, " County", ""),
            State = state,
            CountyCode = as.character(fips),
            Candidate = name,
            Percent = vote_pct / 100) %>%      # vote_pct is 0-100; rescale to 0-1
  spread(Candidate, Percent, fill = 0) %>%     # one column per candidate
  inner_join(us_county_population, by = "CountyCode") %>%
  inner_join(us_county_area, by = "CountyCode")

you can save the data into a csv file:

# write_csv(county_data, "county_election_2016.csv")

You can download the cleaned datafile here: data_election_2016_by_county

4. Upload data and shape files

dt <- read.csv("new_county_election_2016.csv", header=T) # the cleaned election results
us <- readShapePoly("./USA_adm/USA_adm2.shp") # county-level (admin level 2) polygons
us0 <- readShapePoly("./USA_adm/USA_adm0.shp") # national border, drawn on top as an overlay
# NOTE(review): readShapePoly (maptools) is defunct; sf::st_read is the modern replacement
us.m <- us[-c(which(us$NAME_1=="Alaska")),] #get rid of Alaska
us.d <- us.m[-c(67:71),] # drop five more polygons with no match in the data — presumably Hawaii; verify against the shapefile

5. Prepare the color palette(s)

nclassint <- 5 #number of colors to be used in the palette
cat.T <- classIntervals(dt$Trump[-c(67:71)], nclassint,style = "jenks") #style refers to how the breaks are created ("jenks" = natural breaks; classInt package)
colpal.T <- brewer.pal(nclassint,"Reds") # sequential palette from RColorBrewer
color.T <- findColours(cat.T,colpal.T) #sequential: assigns each county the color of its class
bins.T <- cat.T$brks # the class break points, used below for the legend labels
lb.T <- length(bins.T)

5. Plot the maps with map basic

# pdf("Where are the trump voters.pdf")
# plot(us.d, col=color.T, border=F)
# plot(us0,add=T, lwd=0.1)
# NOTE: the legend originally referenced an undefined object `bins`; it is `bins.T`
# legend("bottomleft",fill=colpal.T,legend=paste(round(bins.T[-length(bins.T)],1),":",round(bins.T[-1],1)),cex=1, bg="white")
# dev.off()
% Votes for Clinton
% Votes for Trump

… or ggplot2


# Scatter of population density vs Trump vote share, Texas counties highlighted in red.
# (Population, Area, Trump, State, County come from county_data built above;
#  percent_format() is from the scales package.)
ggplot(county_data, aes(Population / Area, Trump)) +
  geom_point() +
  geom_point(data=county_data[which(county_data$State=="Texas"),], aes(x=Population/Area, y=Trump), colour="red")+
  scale_x_log10() + # population density is heavily right-skewed, so log the x axis
  scale_y_continuous(labels = percent_format()) +
  xlab("Population density (ppl / square mile)") +
  ylab("% of votes going to Trump") +
  geom_text(aes(label = County), vjust = 1, hjust = 1, check_overlap = TRUE) + # check_overlap drops labels that would collide
  geom_smooth(method = "lm") +
  ggtitle("Population density vs Trump voters by county (Texas Counties in red)")

This is the code to plot in red points according to State (in red) and to add red labels to those points. The check_overlap=T avoids overlapping labels.

# Same plot with California highlighted; red labels only for counties where Trump > 50%.
# NOTE: the original had the HTML-escaped entity "&amp;gt;" where ">" belongs, and used
# dt$County inside aes() on a *subsetted* data frame (misaligned rows); both fixed.
# ggplot(county_data, aes(Population / Area, Trump)) +
#   geom_point() +
#   geom_point(data=county_data[which(county_data$State=="California"),], aes(x=Population/Area, y=Trump), colour="red")+
#   scale_x_log10() +
#   scale_y_continuous(labels = percent_format()) +
#   xlab("Population density (ppl / square mile)") +
#   ylab("% of votes going to Trump") +
#   geom_text(data=county_data[which(county_data$State=="California"),], aes(label = ifelse(Trump > .5, as.character(County), "" )), color= "red",size=5,vjust = 1, hjust = 1, check_overlap = TRUE) +
#   geom_smooth(method = "lm") +
#   ggtitle("Population density vs Trump voters by county (California in red)")



Saving graphics in R

A brief minimal guide on saving graphics in R

This is intended to be a minimalistic guide on how to save graphics in an R environment giving tips on formats and codes.

1. What format?

a. Vector files like PDF, EPS, PS, SVG: high quality, easily resizable and work in any environment. In particular, I find PDF to work great with LaTeX, ppt, and word. pdf("mygraph.pdf").
b. WMF: easily resizable but works only in a Windows environment. I don’t own or work with Windows, so I have never used this format. The general command is win.metafile(“mygraph.wmf”). I personally despise word as a writing tool, I wrote my master thesis in it and it was a nightmare, but if you really need to use it… If you have a MAC (and you are still using word) I recommend you take a look at this website for inspiration. If you work in a Windows environment free alternatives here, and here (mostly for reports or lecture notes, but I know people who write entire articles), not free here, …And LaTeX for All. If you work with Linux you’re porbably laughing.
c. JPG   –> never use jpg formats
d. PNG, TIFF are bitmap (or raster) formats, preferable for raster graphics, such as photos. png(“myplot.png”) or tiff(“myplot.tiff”). Good to know: to make more than one page of  graphs add the -%d. as in png(“plot-%d.png”) see example 3.
e. svg is another vector format, like pdf or eps. Default settings for svg() does not allow for multiple pages in a single file

f. one extra mention for the .eps format, the one I normally use and that I find the most practical. I use it to store all graphs for the most disparate purposes: to include them into a LaTeX document (it will just transform your .eps files into .pdf(s) and add them to your library), for presentations in ppt, keynote or LaTeX (again) and publications. Windows usually does not visualize authomatically encapsulated scripts **but** if you own a Windows machine, you can always download a program such as Ghostscript ,GIMP , Photoshop , or EPS viewer


2. How to use it?

This works with most plotting libraries: (1) first call your format saving line (e.g. pdf, png, jpeg, postscript…), (2) plot commands, (3) dev.off()

dev.off() tells to stop saving whatever you are plotting, meaning that if you don’t call it you may end up with a bunch of graphs on the same page.

example 1:
pdf("myplot.pdf") # (1) open the graphics device first — without this nothing is saved
plot(x, y) # (2) plotting commands
dev.off() #dev.off() closes the graphics device, it stops the saving of any further plotting commands, so be sure to add it when you are done with plotting

Alternatively you can use the dev.print command, which produces postscript prints:

example 2:
plot(x, y)
dev.print(pdf, "myplot.pdf") #here I use pdf, but it can be any other format... dev.print copies the current screen device into the file
dev.off ()

example 3:  Multiple pages: 1 plot per page…
png("plot-%d.png") # %d is replaced by the page counter, producing one file per plot
plot(x, y) # saved as plot-1.png

plot(x, y) # saved as plot-2.png
dev.off() # close the device when done

3. What if I am using ggplot2 or ggmap?
ggsave You only need to specify the filename, it’s very convenient for quick plots. It saves the last plot that you displayed; the default size is the size of the current graphics device, unless otherwise specified in height and width, and the unit of measure can be set via units = c("in", "cm", "mm"). It guesses the type of graphics device from the extension: see this for more details
Of course, you need plot in ggplot2 to use ggsave…

4. What size?
The code above formats plots according to the size in which they are displayed in R or by default values (in inches). Sizing can be controlled via width, height.

pdf("myplot.pdf", width=10, height=5) # width and height are in inches
a # I am using code from this post ("a" is a previously built ggplot object; printing it draws it into the pdf device — remember dev.off() afterwards)

5. How to customize graphsThis works with pdf() and postscript() -I always use postscript…-
As usual, like most things in R, everything is highly customizable, for instance you can:
1. Define the font family to be used via the family option. The default is Helvetica but you can find an exhaustive list of fonts here
2. bg changes the background color (I usually set it to transparent bg= “transparent”, so that I don’t have problems when using graphs for presentations, especially if they are in powerpoint or keynote)
3. horizontal direction of the printed image, if set to FALSE it’s vertical

… and much more… Those three are the ones that I most commonly use.


To paraphrase Röyksopp, what else is there?

Well, a lot…

  1. the Cairo package (see this link)  to export anti-aliased, high resolution plots in R for Windows
  2. I have purposedly avoided mentioning the lattice package, mostly because I don’t use it. Lattice is a trellis graphics system that exists in parallel with the normal R graphics system and the graph exporting system is a bit different from that of other environments. A great intro for anyone who wants a go at lattice is this set of slides. In general, when plotting in R you have plenty of choice and usually one environment (either ggplot, plot basic, or lattice just to mention some) is enough to do everything.
  3. Some journals have strict requirements for graph quality, and the .eps format called via postscript("myveryniceplot.eps", paper="special", onefile=FALSE) seems to do the trick
  4. The whole world of interactive plotting: see ggvis, plotly, htmlwidgets, googleVis, and shiny (just to mention a few)

Arranging ggplot2 graphs on a page

How to arrange graphs in ggplot2 without the help of the layout matrix

How do you arrange non-symmetric plots in ggplot2?
With the print command:

After installing these two packages: install.packages(“grid”, “ggplot2”), load the  libraries:

The data and code for the three graphs is taken from this website:

# Create factors with value labels for gear count, transmission type and cylinders
# (within() evaluates each assignment against the data frame's own columns).
mtcars <- within(mtcars, {
  gear <- factor(gear, levels = c(3, 4, 5), labels = c("3gears", "4gears", "5gears"))
  am   <- factor(am,   levels = c(0, 1),    labels = c("Automatic", "Manual"))
  cyl  <- factor(cyl,  levels = c(4, 6, 8), labels = c("4cyl", "6cyl", "8cyl"))
})

# Kernel density plots for mpg
# grouped by number of gears (indicated by color)
# NOTE: the original snippet was truncated — the ylab argument and the closing
# parenthesis were missing, so the call never parsed.
a <- qplot(mpg, data=mtcars, geom="density", fill=gear, alpha=I(.5),
main="Distribution of Gas Milage", xlab="Miles Per Gallon",
ylab="Density")

# Scatterplot of mpg vs. hp for each combination of gears and cylinders
# in each facet, transmission type is represented by shape and color
b <- qplot(hp, mpg, data=mtcars, shape=am, color=am,
facets=gear~cyl, size=I(3),
xlab="Horsepower", ylab="Miles per Gallon")

# Boxplots of mpg by gear count, with the individual observations jittered on top
c <- qplot(gear, mpg, data=mtcars, geom=c("boxplot", "jitter"),
fill=gear, main="Mileage by Gear Number",
xlab="", ylab="Miles per Gallon")

a, b, and c are our graphs. Here we decide how to place the plots on the plotting surface:

grid.newpage() # Open a new page on grid device
pushViewport(viewport(layout = grid.layout(3, 1))) #this can really be anything (here 3 rows x 1 column)... just remember to change accordingly the print commands below
print(a, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:1)) # top slot
print(b, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:1)) # middle slot
print(c, vp = viewport(layout.pos.row = 3, layout.pos.col = 1:1)) # bottom slot

The layout=grid.layout is the command dividing the plotting surface, in the example I have divided it into three rows and one column, hence the layout.pos.row = 1, 2, 3 and the layout.pos.col = 1:1 equal for all three plots.


What if I need something asymmetrical? For instance two small plots on one column and one taking up more space… The reasoning is very similar to that of the layout matrix: divide the space into 4 squares grid.layout(2, 2) and then plot the third graph over two rows layout.pos.row=1:2

grid.newpage() # Open a new page on grid device
pushViewport(viewport(layout = grid.layout(2, 2))) #this can really be anything (here a 2 x 2 grid)... just remember to change accordingly the print commands below
print(a, vp = viewport(layout.pos.row = 1, layout.pos.col = 1:1)) # top-left cell
print(b, vp = viewport(layout.pos.row = 2, layout.pos.col = 1:1)) # bottom-left cell
print(c, vp = viewport(layout.pos.row = 1:2, layout.pos.col = 2:2)) # right column, spanning both rows


How to get good maps in R and avoid the expensive software

How to convey as much information as possible in a clear and simple way? Producing maps for social sciences is not difficult, there is a plethora of software that can help us. But there are a few issues to consider when choosing your go-to program:
(1) Do I want to do all my analysis in one (or more) program(s) and then switch to another one to make those maps?
(2) Are those programs freely accessible to me?
1. Using more than one piece of software usually implies spending time learning different syntax: why do your analysis in (insert name here ____) and then plot in R when you can do everything in R?
2. The availability of mapping softwares is no trivial issue. Not all researchers have powerful computers, not all institutes have bottomless funds to buy licences, and sometimes having the possibility to map on your laptop while bingeing on Netflix is way nicer than waiting for the one computer with the one licence.

Probably the best and most elegant mapping tool available to Geographers is ArcGIS (to my knowledge, but again, I use R and own a Mac), however it does not come for free. What to do? Well, R is a very good alternative, you can produce elegant maps, customizable to the very last detail. The only drawback I have encountered is the time you would spend to get the first map, but then you would have the syntax and any other map would be pretty quick to plot, and you can always for loop all graphics (although I do not recommend it). Moreover, R runs on your Mac (and Linux), it allows for way more control over features, and has great color palettes (see here and here).

Here are some useful libraries:
library(maps) #for creating geographical maps
library(RColorBrewer) #contains color palettes
library(classInt) #defines the class intervals for the color palettes
library(maptools) #tools for handling spatial objects (retired; sf is the modern replacement)
library(raster) #tools to deal with raster maps
library(ggplot2) #to create maps, quick and painless

Some stuff to keep in mind:
(1) add a scale with scale.map (or a nice  scalebar);
(2) it is sometimes required to add a north arrow, you can find many versions for that (see this document on page 4 for  examples, I use the same with no labels);
(3) locator() is a very useful tool to get the coordinates when adding labels, arrows, scales and so on.

Part 1: get a plain map.

Below is a very simple example produced using EUROSTAT shape files for world countries (world) and DIVA-GIS for Spain at NUTS3 level (spain). In this map I have removed the Canary Islands, but you can always cut it and paste it in the map using either par(fig=c(…)) or par(fin(…)), inset, or something more elaborated with layout, and framing it using box() or rectangle.

world is the shapefile for the whole world, where I select the neighboring countries I want to appear in the map, in this case Spain, France, Portugal, France, Morocco, and Algeria.
spain is the Spain NUTS3 shapefile where I remove the Canary Islands (45)

plot(spain[-c(45),], border=F) #this first line does not plot anything, it just centers my graph on Spain, the -c(45) removes the Canary Islands
plot(world[c(6, 67,74, 132, 177),], border="lightblue",add=T, col="beige") #plotting the countries appearing in the map (the indices pick Spain's neighbours out of the world shapefile)
plot(spain[-c(45),], border="brown", lwd=0.2, add=T, col="lightblue") #plot spain, removing the Canary Islands
map.scale(3,35.81324, ratio=F, cex=0.7, relwidth=0.1) # scale map (maps package; the first two arguments are the lon/lat position)
northarrow(c(4.8,42.9),0.7, cex=0.8) # NOTE(review): northarrow() is not in a standard package — a user-defined helper; confirm its source


Part 2: Add labels

Using the function shadow text to avoid labels overlapping.

coords<- coordinates(spain) # get coordinates of the centroids, it's where you center your labels
# p.names is a data frame containing the coordinates and all the names of the provinces (remember to get rid of those you don't want to use if using only a selection). Usually you can find the names in the shapefile, but I didn't have them.
shadowtext(p.names[,1],p.names[,2], label=paste(p.names[,3]), cex=0.7,col="black", bg="white",r=0.1) # x, y, label; r controls the width of the white "shadow" halo around the text



Ubicación, ubicación, ubicación! ¿Por qué asuntos espaciales en la demografía y por qué debemos cuidar.

Me he dado cuenta solo ahora que mi post en Demotrends sobre la dimension espacial de los fenomenos demograficos ha sido traducido en español por el grupo “Población y Desarrollo en Honduras”, muchas gracias! Aquí esta:

Los fenómenos demográficos son inherentemente espaciales, así como las poblaciones humanas no se encuentran al azar en los patrones espaciales y liquidación dependen de atributos geográficos estructurales. En este contexto, el análisis espacial se centra en el papel del espacio en la explicación del fenómeno que se investiga, ejemplificada por la Primera Ley de la Geografía de Tobler : “todo está relacionado con todo, y los lugares más que cerca están más relacionados de lugares lejanos” (Tobler, 1970). La dimensión espacial de los fenómenos demográficos ha demostrado ser de gran importancia en la comprensión del papel de las características personales y el impacto del medio ambiente en este tipo de atributos. Sin embargo, la mayoría de los estudios tienden a ignorar esta dependencia espacial. Por ejemplo, si tenemos en cuenta el nivel de la tasa global de fecundidad (TGF), podemos decir que la TGF se autocorrelaciona espacialmente, es decir grupos de áreas muestran algún grado de dependencia, con valores similares para las zonas vecinas. Este es un punto importante, ya que la presencia de autocorrelación espacial puede sugerir la existencia de variables no observadas o no incluidas en el modelo.

Recordando la Primera Ley de la Geografía de Tobler, relaciones de distancia y vecinos entre diferentes áreas pueden ser particularmente importantes para comprender hasta qué punto es la dependencia espacial que existe y para entender “cómo establecer relaciones de vecindad” con el fin de estar relacionado, o espacialmente autocorrelacionados. De los diversos instrumentos utilizados en econometría espacial para comprender la dependencia espacial, índice I de Moran (Moran, 1950) es una de las estadísticas más utilizadas, ya que ayuda a cuantificar el nivel global de autocorrelación y discernir si se trata de un fenómeno aleatorio. (Gráfico 1) Sin embargo, el I de Moran no nos dice la “historia total”, y tenemos que complementarlo con otras herramientas como (semi) variograma, correlograma o análisis de variograma, que se refieren a la dependencia espacial a distancia por medio de covarianza, correlación y semivarianza a través de valores diferenciales observados entre vecinos ( Griffith y Paelinck, 2011: capítulo 3 ) y las medidas locales de asociación espacial, tales como I de Moran a nivel local para evaluar la agrupación y el significado de cada unidad espacial.

Obras recientes en el campo de la demografía espacial han evidenciado que la adición de la dimensión tiempo para el análisis espacial puede proporcionar información sobre la adopción de un nuevo régimen demográfico y cómo sus variables constitutivas son impactados a través del tiempo. Esta es una cuestión importante, ya que nos enteramos del proyecto de Princeton que la dimensión espacial es crucial para entender los procesos de difusión durante la primera transición demográfica en Europa ( Coale y Watkins, 1986 ). Sin embargo en la mayoría de los estudios de la Segunda Transición Demográfica, el componente espacial es a menudo pasado por alto. Esto es en parte debido a la disponibilidad de datos y también porque las transiciones demográficas son considerados como el resultado de un país procesos específicos. Pero centrarse en el nivel nacional en vez de la local al analizar los cambios en el régimen demográfico, por lo general pierden precursores, así como los rezagados. Un ejemplo clásico en España es la región de Cataluña, que fue un precursor de la Primera y la Segunda transiciones demográficas en comparación con el resto del país y de las regiones del Sur específicamente. Mapa 1. clustrs significativas para el índice de Princeton, 1981Mapa 2. agrupaciones significativas para el índice de Princeton, 2011

La forma más sencilla y práctica de la comprensión de cómo la dependencia espacial ha evolucionado a través del tiempo es por medio de las estadísticas locales de asociación espacial, en el que probar si y donde existen grupos de áreas con características similares. Anselin (1995) sugirió que los indicadores locales de asociación espacial , LISA, una técnica similar a la I de Moran, pero computarizada y evaluado para cada unidad espacial, comparable a una regresión lineal entre la variable medida en una cierta ubicación y la misma magnitud de medida en cada ubicación.

Por lo tanto, es muy fácil de ver cómo espacial autocorrelación puede alterar el resultado de un estudio que no toma en cuenta el espacio, por lo tanto, el viejo adagio de la propiedad, “ubicación, ubicación, ubicación!” También se puede aplicar también a la demografía. En el contexto de la heterogeneidad espacial de la fertilidad, España es un país único en Europa, con una larga y bien documentada de la diversidad regional y provincial por más de dos siglos. Tener una mirada a los cambios de fertilidad municipales más de las tres últimas décadas puede ser muy indicativo de cómo 1. La fertilidad disminuye con diferentes trayectorias, 2. La reciente recuperación de la fecundidad ha interesado sólo determinadas zonas hasta el inicio de la reciente crisis económica, 3. Migración ha afectado profundamente los patrones de fecundidad en las grandes ciudades, pero dejó otras regiones afectados. En cuanto a las medidas globales de autocorrelación (ver Gráfico 1), podemos entender por qué la dependencia espacial es un fenómeno que evoluciona el tiempo que puede cambiar y revertir su camino. Por ejemplo, el gráfico 1 muestra cómo en tiempos de expansión económica -y Fertilidad, autocorrelación espacial alcanza su pico, mientras que en tiempos de recesión económica -y la fertilidad declinación- que cae en picado, estabilizándose gran parte de las diferencias de fertilidad entre las regiones. Esto se debe principalmente a la forma en que las personas tratan de hacer frente a veces en dificultades al retrasar los nacimientos hasta que vengan tiempos mejores. Los dos mapas LISA grupo de clúster de la variable de interés, en este caso de Princeton Índice, utilizando una estadística local de cuatro grupos divididos como: rojo alta altos cúmulos de áreas con -relativamente- alta fertilidad rodeadas de alta fertilidad, azul oscuro bajo-bajo clusters, la luz roja de alta bajo racimos de altas áreas de fertilidad rodeadas de baja fertilidad y de color azul claro bajo altos cúmulos. 
La tradicional división de España en la alta fertilidad del Sur y la baja fertilidad del Norte se ha desplazado desde mediados de los años 90 en una división Este-Oeste con grandes ciudades como puntos calientes de la alta fecundidad, como se muestra en los mapas de la LISA 1 y 2.

Aunque las técnicas espaciales en la demografía a menudo se aplican a áreas pequeñas, el enfoque a gran escala puede abordar grandes cuestiones cuando un método más heurística falla. Mapeo puede ser una poderosa herramienta para entender la dinámica geográfica, pero sin necesidad de herramientas econométricas, temas tan importantes como la aleatoriedad y la significación estadística puede sesgar sustancialmente nuestros resultados. Por otra parte, la recolección de datos SIG se está volviendo más y más común en la demografía y en el espacio definitivamente arrojar nueva luz sobre los fenómenos demográficos.

Blogpost on Demotrends: Location, location, location! Why space matters in demography and why we should care. https://demotrends.wordpress.com/2014/11/06/location-location-location/

Mind the gap: the compass of foregone fertility in Europe

On Demotrends you can find some of the main findings from my collaboration with Daniel Devolder, during my stay at CED in Barcelona. Enjoy!


Simulation results showing the percentage of realized and simulated total fertility with respect to desired fertility.