BAR CHART + LINE: a ggplot balance plot (1)

You can download the session 9 files for constructing the population pyramids of Georgia here: https://github.com/rladies/meetup-presentations_tbilisi and specify your working directory with setwd("/Users/mydomain/myfolder/")

BAR CHART + LINE:

###Graph 1: Total services trade, by value

Source: Georgia 2013

 library(ggplot2) # plotting
library(dplyr)   # data manipulation: filter(), group_by(), summarise(), %>%

# Folder that holds the session data; adjust to your own machine.
mypath <- "/Users/StayPuftMarshmallowMan/Shandor Folder/"
setwd(mypath)

# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too,
# which the levels(mydt$variable) call further down relies on.
mydt <- read.csv(
  "Session_2_Georgia_Data_UN.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

head(mydt)
##                                            variable     type year   value
## 1 GDP: Gross domestic product (million current US$) economic 2014 16530.0
## 2 GDP: Gross domestic product (million current US$) economic 2010 11638.0
## 3 GDP: Gross domestic product (million current US$) economic 2005  6411.0
## 4    GDP growth rate (annual %, const. 2005 prices) economic 2014     4.8
## 5    GDP growth rate (annual %, const. 2005 prices) economic 2010     6.2
## 6    GDP growth rate (annual %, const. 2005 prices) economic 2005     9.6
##   geo
## 1
## 2
## 3
## 4
## 5
## 6
levels(mydt$variable)
##  [1] "Agricultural production index (2004-2006=100)"
##  [2] "Balance (million US$)"
##  [3] "Balance of payments, current account (million US$)"
##  [4] "CO2 emission estimates (tons per capita)"
##  [5] "CPI: Consumer price index (2000=100)"
##  [6] "Economy: Agriculture (% of GVA)"
##  [7] "Economy: Industry (% of GVA)"
##  [8] "Economy: Services and other activity (% of GVA)"
##  [9] "Education: Government expenditure (% of GDP)"
## [10] "Education: Tertiary gross enrolment ratio (f-m per 100 pop.)"
[...]
## [48] "Unemployment (% of labour force)"
## [49] "Urban population (%)"
## [50] "Urban population growth rate (average annual %)"
# Keep only the services-trade rows.
ser.dt <- mydt %>%
  filter(variable == "Total Services Trade")

# Yearly balance = exports - imports. The flows are picked by their `type`
# label rather than by row position, so the sign of the result does not depend
# on how the rows happen to be ordered in the file.
Balance <- ser.dt %>%
  group_by(year) %>%
  summarise(
    value = sum(value[type == "Exports"]) - sum(value[type == "Imports"])
  ) %>%
  ungroup() %>%
  # One "Balance" row per year, however many years the data holds; geo is a
  # real missing value rather than the string "NA".
  mutate(variable = "Total Services Trade", type = "Balance", geo = NA)

# Same column order as the source data before stacking the two tables.
Balance <- as.data.frame(
  Balance[, c("variable", "type", "year", "value", "geo")]
)

mydata <- rbind(ser.dt, Balance)

subset with the pipe operator %>%

# Bars: every row except the balance; anything that is not an export gets its
# sign flipped so it is drawn below zero.
base <- filter(mydata, type != "Balance")
base <- mutate(base, value = ifelse(type != "Exports", -value, value))

# Line: the balance rows only.
balance <- filter(mydata, type == "Balance")

# Export/import bars (from `base`) with the yearly balance (from `balance`)
# overlaid as red points joined by a line.
ggplot(data = balance, mapping = aes(x = year, y = value)) +
  geom_bar(data = base, mapping = aes(fill = type), stat = "identity") +
  geom_point(mapping = aes(colour = type)) +
  # group = 1 joins all points into a single line
  geom_line(mapping = aes(colour = type, group = 1)) +
  scale_fill_manual(
    name = "",
    values = c(Exports = "#D55E00", Imports = "#E69F00")
  ) +
  scale_colour_manual(name = "", values = c(Balance = "red")) +
  theme_bw() +
  labs(x = "", y = "Total Services Trade")

graph1

DONUT CHART in ggplot2

 DONUT CHART

I personally don’t like pie charts that much. I prefer donut charts: they take up less space and the center can be used for extra annotations. To get a “donut” in ggplot2, you design a bar chart (geom_bar) and then bend it (coord_polar) at the extremities.

To reproduce the chart below, you can download the data from the RLadies Tbilisi github webpage, Session 9 on Plotting.

Alternatively here’s the dput(-ted) data:

structure(list(X = 1:3, variable = structure(c(1L, 1L, 1L), .Label = "Export of Services", class = "factor"), type = structure(c(3L, 2L, 1L), .Label = c("Remaining", "Transportation", "Travel"), class = "factor"), year = c(2012L, 2012L, 2012L ), value = c(55.5, 33.4, 11.1), geo = c(NA, NA, NA), pos = c(27.75, 72.2, 94.45)), .Names = c("X", "variable", "type", "year", "value", "geo", "pos"), class = "data.frame", row.names = c(NA, -3L))

Exports of services by EBOPS category

#set the working directory
setwd("/Users/DrVenkman/The Gatekeepers Folder/")

library(dplyr)   # data manipulation
library(tidyr)   # data manipulation, wide to long format
library(ggplot2) # ggplot package for plotting

exp.ser <- mydt %>%
  filter(variable == "Export of Services")

# Mid-point of each slice within its year. Kept so exp.ser has the same
# columns as the dput()-ed data; the labels below no longer depend on it.
exp.ser <- exp.ser %>%
  group_by(year) %>%
  mutate(pos = cumsum(value) - value / 2) %>%
  ungroup()

# A donut is a single stacked bar at x = 2 bent into a ring by coord_polar();
# xlim() leaves empty space on the inside, which becomes the hole.
# NOTE(review): the dput()-ed sample has year == 2012 while the title says
# 2013 -- confirm which one is right.
p <- ggplot(exp.ser, aes(x = 2, y = value, fill = type)) +
  geom_bar(stat = "identity") +
  # position_stack(vjust = 0.5) centres each label in its own slice using the
  # same stacking as the bars, whatever the row order of the data.
  geom_text(
    aes(label = value),
    position = position_stack(vjust = 0.5),
    size = 10,
    fontface = "bold"
  ) +
  xlim(0.5, 2.5) +
  coord_polar(theta = "y") +
  labs(x = NULL, y = NULL, fill = "") +
  scale_fill_manual(
    values = c(
      Remaining = "blue",
      Transportation = "#E69F00",
      Travel = "#D55E00"
    ),
    name = ""
  ) +
  ggtitle("Exports of services by EBOPS category, 2013") +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", family = "sans", size = 15),
    legend.text = element_text(size = 10),
    axis.ticks = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    panel.border = element_blank()
  )

p

graph2

 giphy

High Resolution Mapping of Fertility and Mortality from Household Survey Data in Low Income Settings – PAA presentation

I will present my WorldPop mapping of demographic indicators in low-income settings at PAA in Chicago, in the session “Advances in Mathematical, Spatial, and Small-Area Demography”: Thursday, April 27, 2017, 10:15 AM – 11:45 AM, Hilton, Joliet Room.

Violin plots in ggplot2

Use geom_violin() to quickly plot a visual summary of variables, using the Boston dataset, MASS library.

Use geom_violin() to quickly plot a visual summary of variables, using the Boston dataset from the MASS library.

1. Load the relevant libraries:

require(tidyr)
require(ggplot2)
require(RColorBrewer)
require(randomcoloR)
require(MASS)

2. Load data and use the tidyr package to transform wide into long format:

# Stack the Boston columns crim..medv into two columns: the original column
# name goes into `variable`, its values into `value`.
data(Boston)
dt.long <- gather(data = Boston, key = "variable", value = "value", crim:medv)

3. Create some color palettes:

# 14 colours interpolated between red and blue
col <- colorRampPalette(colors = c("red", "blue"))(n = 14)
# col.bp <- brewer.pal(9, "Set1") # brewer.pal only has a max of 9 colors
# 14 colours from randomcoloR, stored as a plain vector
col.rc <- as.vector(distinctColorPalette(k = 14))

4. Plot(s):

  • With the standard colors produced by ggplot2:
# One free-scaled panel per variable: a violin filled by variable with a
# narrow, semi-transparent boxplot on top; ggplot2's default fill colours.
ggplot(data = dt.long, mapping = aes(x = factor(variable), y = value)) +
  geom_violin(mapping = aes(fill = factor(variable))) +
  geom_boxplot(width = 0.1, colour = "black", alpha = 0.3) +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "", y = "") +
  theme_bw() +
  theme(legend.title = element_blank())

violin-ggplot-color

  • With the color palette produced by colorRampPalette:
# Same violin + boxplot panels, filled with the colours stored in `col`.
ggplot(data = dt.long, mapping = aes(x = factor(variable), y = value)) +
  geom_violin(mapping = aes(fill = factor(variable))) +
  geom_boxplot(width = 0.1, colour = "black", alpha = 0.3) +
  scale_fill_manual(name = "", values = col) +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "", y = "") +
  theme_bw()

violin-auto-color

  • With the color palette produced by randomcoloR library:
# Same violin + boxplot panels, filled with the colours stored in `col.rc`.
ggplot(data = dt.long, mapping = aes(x = factor(variable), y = value)) +
  geom_violin(mapping = aes(fill = factor(variable))) +
  geom_boxplot(width = 0.1, colour = "black", alpha = 0.3) +
  scale_fill_manual(name = "", values = col.rc) +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "", y = "") +
  theme_bw()

violin-rc-color

Plot maps with base mapping tools and ggmap in R

Plot maps with ‘base’ mapping tools in R

Understanding what kind of data you have (polygons or points?) and what you want to map is pivotal to start your mapping.

  1. First you need a shapefile of the area you want to plot, such as metropolitan France. There are various places to get them from: DIVA-GIS and EUROSTAT are the ones I use the most. It’s always important to have a .prj file included, as your final map ‘should’ be projected. I say “should” as sometimes it is just not possible, especially if you work with historical maps.
  2. Upload libraries

Load and prepare data

# readOGR() comes from rgdal and map() from maps; neither is loaded anywhere
# else in this walkthrough.
# NOTE: rgdal was archived from CRAN in 2023; on a current R installation
# this step has to be ported to sf::st_read().
library(rgdal)
library(maps)

setwd(mypath)
# Read the layer "FRA_adm2" from the current directory.
fr.prj <- readOGR(".", "FRA_adm2")
## OGR data source with driver: ESRI Shapefile
## Source: ".", layer: "FRA_adm2"
## with 96 features
## It has 18 fields
## NOTE: rgdal::checkCRSArgs: no proj_defs.dat in PROJ.4 shared files
map(fr.prj)
rplot
## Warning in SpatialPolygons2map(database, namefield = namefield): database
## does not (uniquely) contain the field 'name'.

head(fr.prj@data)
##   ID_0 ISO NAME_0 ID_1    NAME_1  ID_2         NAME_2   VARNAME_2
## 0   76 FRA France  989    Alsace 13755       Bas-Rhin  Unterelsaá
## 1   76 FRA France  989    Alsace 13756      Haut-Rhin   Oberelsaá
## 2   76 FRA France  990 Aquitaine 13757       Dordogne        <NA>
## 3   76 FRA France  990 Aquitaine 13758        Gironde Bec-D'Ambes
## 4   76 FRA France  990 Aquitaine 13759         Landes      Landas
## 5   76 FRA France  990 Aquitaine 13760 Lot-Et-Garonne        <NA>
##   NL_NAME_2 HASC_2 CC_2      TYPE_2  ENGTYPE_2 VALIDFR_2 VALIDTO_2
## 0      <NA>  FR.BR <NA> Département Department  17900226   Unknown
## 1      <NA>  FR.HR <NA> Département Department  17900226   Unknown
## 2      <NA>  FR.DD <NA> Département Department  17900226   Unknown
## 3      <NA>  FR.GI <NA> Département Department  17900226   Unknown
## 4      <NA>  FR.LD <NA> Département Department  17900226   Unknown
## 5      <NA>  FR.LG <NA> Département Department  17900226   Unknown
##   REMARKS_2 Shape_Leng Shape_Area
## 0      <NA>   4.538735  0.5840273
## 1      <NA>   3.214178  0.4198797
## 2      <NA>   5.012795  1.0389622
## 3      <NA>   9.200047  1.1489822
## 4      <NA>   5.531231  1.0372815
## 5      <NA>   4.489830  0.6062017
# load or create data
set.seed(100)
# One simulated value per polygon. The count is taken from the shapefile
# instead of being hard-coded, so it always matches the ids below.
myvar <- rnorm(nrow(fr.prj@data))
# manipulate data for the plot
# id = row names of the attribute table, later used as the join key
france.geodata <- data.frame(id = rownames(fr.prj@data), mapvariable = myvar)
head(france.geodata)
##   id mapvariable
## 1  0  1.12200636
## 2  1  0.05912043
## 3  2 -1.05873510
## 4  3 -1.31513865
## 5  4  0.32392954
## 6  5  0.09152878

Use ggmap

# fortify() turns the spatial object into a plain data frame that ggplot2
# can draw.
france.dataframe <- fortify(model = fr.prj)
## Regions defined for each Polygons
head(france.dataframe)
##       long      lat order  hole piece id group
## 1 7.847912 49.04728     1 FALSE     1  0   0.1
## 2 7.844539 49.04495     2 FALSE     1  0   0.1
## 3 7.852439 49.04510     3 FALSE     1  0   0.1
## 4 7.854333 49.04419     4 FALSE     1  0   0.1
## 5 7.855955 49.04431     5 FALSE     1  0   0.1
## 6 7.856299 49.03776     6 FALSE     1  0   0.1
# Now combine the two data frames on their shared "id" column.
# join() is plyr's; calling it through its namespace makes the dependency
# explicit and avoids attaching plyr on top of dplyr.
france.dat <- plyr::join(france.geodata, france.dataframe, by = "id")
head(france.dat)
##   id mapvariable     long      lat order  hole piece group
## 1  0    1.122006 7.847912 49.04728     1 FALSE     1   0.1
## 2  0    1.122006 7.844539 49.04495     2 FALSE     1   0.1
## 3  0    1.122006 7.852439 49.04510     3 FALSE     1   0.1
## 4  0    1.122006 7.854333 49.04419     4 FALSE     1   0.1
## 5  0    1.122006 7.855955 49.04431     5 FALSE     1   0.1
## 6  0    1.122006 7.856299 49.03776     6 FALSE     1   0.1
# Plot 3
# Polygons filled by mapvariable on a pale-yellow to red gradient, outlined
# with thin white borders.
p <- ggplot(
  data = france.dat,
  mapping = aes(x = long, y = lat, group = group)
) +
  geom_polygon(mapping = aes(fill = mapvariable)) +
  geom_path(color = "white", size = 0.1) +
  coord_equal() +
  scale_fill_gradient(low = "#ffffcc", high = "#ff4444") +
  labs(title = "Our map", fill = "My variable")
# plot the map
p

image-22-02-2017-at-12-11

Use plot basic

library(classInt)     # classIntervals(), findColours()
library(RColorBrewer) # brewer.pal()

nclassint <- 5 # number of colors to be used in the palette
# NOTE: the name `cat` masks base::cat() for the rest of the session.
# style refers to how the breaks are created
cat <- classIntervals(myvar, nclassint, style = "jenks")
colpal <- brewer.pal(nclassint, "RdBu")
# RdBu is a diverging palette; it is reversed so that the lowest class gets
# blue and the highest class gets red.
color <- findColours(cat, rev(colpal))
bins <- cat$brks
lb <- length(bins)
plot(fr.prj, col = color, border = TRUE)
# One legend entry per class, labelled "lower : upper" from consecutive breaks.
legend(
  "bottomleft",
  fill = rev(colpal),
  legend = paste(round(bins[-length(bins)], 1), ":", round(bins[-1], 1)),
  cex = 1,
  bg = "white"
)

image-22-02-2017-at-12-23-copy

Find color breaks for mapping (fast)

I’ve stumbled upon a little trick to compute jenks breaks faster than with the classInt package, just be sure to use n+1 instead of n as the breaks are computed a little bit differently. That is to say, if you want 5 breaks, n=6, no biggie there.

For more on the Bayesian Analysis of Macroevolutionary Mixtures see BAMMtools library

# Install BAMMtools only when it is missing, so re-running the script does
# not reinstall the package every time.
if (!requireNamespace("BAMMtools", quietly = TRUE)) {
  install.packages("BAMMtools")
}
library(BAMMtools)
# Time the computation of 6 Jenks break points.
system.time(getJenksBreaks(mydata$myvar, 6))
## user system elapsed
## 0.970 0.001 0.971

On the other hand this takes way more time with large datasets
library(classInt)
# Same data, 5 Jenks classes via classInt; the console output below is kept
# as a comment so the snippet can be run as-is.
system.time(classIntervals(mydata$myvar, n = 5, style = "jenks"))
## Timing stopped at: 1081.894 1.345 1083.511

Upload files in R

Upload files from Excel, STATA, SAS, SPSS and text

First set the working directory (or check it)

getwd() # get working directory
## [1] "/Users/me/My Folder/"
setwd("./My Subfolder/") # set working directory (relative to the current one)

1. .csv and .txt files

The read.csv function has many options. Some of them are header=T, which sets the first line as column names; sep=",", the field separator character (in this case the comma); dec=".", the decimal separator character; and skip=2, the number of lines to skip (in this case 2).

read.csv2 is identical to read.csv except that it assumes the comma to be the decimal separator and the semicolon to be the field separator.

read.table works similarly to read.csv, but reads text files.

 # header = TRUE: the first line of the file holds the column names
 mydata <- read.csv("mydata.csv", header = TRUE)

When importing data in R, if any column’s name is a number, R will add an X to it (as in general it is a very bad idea to have numbers for column names, but can be handy). You can replace column names with:

 # overwrite all six column names at once
 names(mydata) <- c("name1", "name2", "name3", "2017", "2018", "2019")

If you change or add anything to your data and want to save it then ( write.table for txt output):

 # row.names = FALSE: do not write the row names as an extra first column
 write.csv(x = mydata, file = "mydata.csv", row.names = FALSE)

2. STATA files .dta

 library(foreign)
 # to read from Stata (read.dta() supports files saved by Stata versions 5-12)
 mydata <- read.dta("mydata.dta")
 # to save in Stata format
 write.dta(mydata, "mydata.dta")

3. SPSS files .sav

use.value.labels by default is TRUE and converts value labels into factors. The mydata.txt is the name for data output, while the mydata.sps is the code output.

library(foreign)
# to read from SPSS: the file name needs its .sav extension;
# use.value.labels = FALSE keeps the codes instead of converting value labels
# into factors
mydata <- read.spss("mydata.sav", to.data.frame = TRUE, use.value.labels = FALSE)
# to save for SPSS: mydata.txt receives the data, mydata.sps the code
write.foreign(as.data.frame(mydata), "mydata.txt", "mydata.sps", package = "SPSS")

4. SAS files .sas

Note that by default it converts value labels into factors

## to read from SAS
library(Hmisc)
mydata <- sasxport.get(file = "mydata.xpt")

## to save in SAS format
library(foreign)
write.foreign(
  df = as.data.frame(mydata),
  datafile = "mydata.txt",
  codefile = "mydata.sas",
  package = "SAS"
)

5. Excel spreadsheet

# Option 1: the xlsx package
library(xlsx)
mydata <- read.xlsx("c:/myexcel.xlsx", sheetIndex = 1) # 1 refers to the first worksheet; alternatively...
mydata <- read.xlsx("c:/myexcel.xlsx", sheetName = "Data input")
write.xlsx(mydata, "mydata.xlsx")

# Option 2: the readxl package
library(readxl)
# Pass the path of your own file directly. system.file() only locates files
# shipped inside an installed package, so it returns "" for a user file.
mydata <- read_excel("mypath/myexcel.xlsx", sheet = 1)

(A few) quick tricks

head(mydata, n = 10) # first 10 rows
tail(mydata, n = 10) # last 10 rows
mydata[1, 1:10] # print first row and first 10 columns
names(mydata) # variable names
length(mydata) # number of elements; for a data frame, the number of columns
nrow(mydata) # number of rows
ncol(mydata) # number of columns
str(mydata) # list structure of data
class(mydata) # class of data
View(mydata) # opens viewer window (capital V: R is case-sensitive)