443-970-2353
[email protected]
CV Resume
In this post, we will use the rvest web-scraping R package to scrape US population data from Wikipedia, and then use ggplot2 to visualize the population data by state.
library(rvest)
library(dplyr)
library(calibrate)
library(stringi)
library(ggplot2)
library(maps)
library(ggmap)
# Download the Wikipedia page and parse the first <table> element on it.
# `fill = TRUE` asks html_table() to pad rows that have fewer cells than the
# widest row; it is spelled out as TRUE because `T` is an ordinary variable
# that can be reassigned.
wiki <- read_html(
  "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population"
)
states <- wiki %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table(fill = TRUE)

# Preview the first rows of the scraped table.
head(states)
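Now we can use the stringi package to fix some minor problems: we trim the unwanted leading characters from several of the scraped columns, and then remove the thousands separators (commas) from the numeric columns.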
# Trim unwanted leading characters from the scraped columns.
# Columns are extracted with `[[` rather than `[, j]` so that each one is
# handled as a plain vector whether `states` is a data.frame or a tibble
# (`[, j]` on a tibble returns a one-column tibble, not a vector).

# Columns 1, 2 and 7: keep everything from the 22nd character onwards.
for (j in c(1, 2, 7)) {
  states[[j]] <- stri_sub(states[[j]], 22)
}

# Columns 3 and 12: keep everything from the 3rd character onwards.
for (j in c(3, 12)) {
  states[[j]] <- stri_sub(states[[j]], 3)
}
head(states)

# Remove the comma thousands separators from columns 4 to 10 so the values
# can later be converted to numbers.
states[4:10] <- lapply(states[4:10], function(column) gsub(",", "", column))
head(states)
Let's make sure the column names are syntactically valid R names.
# Turn the scraped headers into syntactically valid R identifiers so the
# columns can be referenced with `$`, then display the resulting names.
states <- setNames(states, make.names(names(states)))
names(states)
# Load the state polygon data used to draw the map and inspect its structure.
statesMap <- map_data("state")
str(statesMap)

# Build a lower-case join key from the state name so it can be matched
# against statesMap$region, then attach the population columns to every
# polygon vertex. `all.x = TRUE` keeps map rows that have no match in
# `states` (their population columns become NA).
states$region <- tolower(states$State.or.territory)
statesMap <- merge(statesMap, states, by = "region", all.x = TRUE)
str(statesMap)

# Convert the scraped figures to numeric. The columns are selected by their
# position in the merged data frame.
# NOTE(review): these positions depend on the layout of the scraped table;
# confirm them against the str() output above if the source page changes.
numeric_cols <- c(10, 11, 12, 14, 15, 16)
statesMap[numeric_cols] <- lapply(statesMap[numeric_cols], as.numeric)

# Re-sort the rows by polygon group and vertex order so that each polygon's
# points are back in drawing sequence after the merge.
statesMap <- statesMap[order(statesMap$group, statesMap$order), ]
# Choropleth of the 2010 census population.
ggplot(
  statesMap,
  aes(
    x = long,
    y = lat,
    group = group,
    fill = Census.population..April.1..2010
  )
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(
    name = "Population 2010",
    low = "#B8E6E6",
    high = "darkblue",
    guide = "colorbar",
    na.value = "white"
  )

# Choropleth of the July 2014 population estimate.
ggplot(
  statesMap,
  aes(
    x = long,
    y = lat,
    group = group,
    fill = Population.estimate.for.July.1..2014
  )
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(
    name = "Population 2014",
    low = "#E6E6B8",
    high = "#1A4C1A",
    guide = "colorbar",
    na.value = "white"
  )

# Choropleth of the percentage change from the 2000 census population to
# the 2014 estimate.
ggplot(
  statesMap,
  aes(
    x = long,
    y = lat,
    group = group,
    fill = (Population.estimate.for.July.1..2014 /
      Census.population..April.1..2000 - 1) * 100
  )
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(
    name = "% change of census population \n between 2000 and 2014",
    low = "white",
    high = "red",
    guide = "colorbar",
    na.value = "white"
  )

# Choropleth of the 2010 census population per House seat.
ggplot(
  statesMap,
  aes(
    x = long,
    y = lat,
    group = group,
    fill = X2010.Census.pop..per.House.seat.4.
  )
) +
  geom_polygon(color = "black") +
  scale_fill_gradient(
    name = "Census population \n per house seat 2010",
    low = "white",
    high = "black",
    guide = "colorbar",
    na.value = "white"
  )