Tutorial 8 Answers

Problem Set 8: Questions and Answers

  1. In my example of DC population over time in section B.1, I present the graph in three steps. Modify your code to reproduce these same three steps.


To break the graph into three parts, you need to make sure that you keep the x axis the same for each step so the graph looks like you are just adding a few years.

# load libraries
# (the code in this answer key calls functions from each of these packages)
library(ggplot2)  # ggplot(), geom_*(), scale_*(), theme(), annotate()
library(scales)   # comma, used as the y-axis label formatter
library(dplyr)    # group_by(), summarize(), n()
library(tidyr)    # pivot_longer()
# load data
counties <- read.csv("h:/pppa_data_viz/2019/tutorial_data/lecture08/counties_1910to2010_20180116.csv")

Now just limit the data to DC. You could do this in the ggplot call itself. However, in this case when we are only planning to use DC, this gives us a smaller dataset to work with and that speeds processing. This will also make the coding easier, since we won’t have to subset in each graph.

Take a look at the data after we subset to DC. Does it have the right number of observations?

# text size shared by every on-graph annotation in the plots that follow
on.g.text.size <- 4

# read the county file
counties <- read.csv("h:/pppa_data_viz/2019/tutorial_data/lecture08/counties_1910to2010_20180116.csv")

# keep only the DC rows (state FIPS code 11); rows with a missing
# statefips are dropped as well
dct <- subset(counties, statefips == 11)

# step 1 of 3: only the years up to and including 1950
# The axis limits are fixed at the full 1910-2010 / 0-825,000 range so that
# this graph lines up with the two later steps.
done2a <-
  ggplot(data = subset(dct, year <= 1950),
         mapping = aes(x = year, y = cv1)) +
  geom_line(size = 1.5) +
  geom_point(size = 3) +
  scale_x_continuous(breaks = seq(1910, 2010, 20),
                     limits = c(1910, 2010)) +
  scale_y_continuous(breaks = seq(0, 800000, 200000),
                     limits = c(0, 825000),
                     labels = comma) +
  labs(x = "", y = "") +
  theme(
    # blank canvas: no background and no grid ...
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # ... except gray horizontal guide lines
    panel.grid.major.y = element_line(color = "gray"),
    # keep a black baseline on the x axis, but no tick marks anywhere
    axis.line.x = element_line(color = "black"),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text = element_text(size = 15),
    legend.position = "none"
  )

# step 2 of 3: the years up to and including 1980, plus the 1954 and 1968
# event markers
# Each vertical marker is drawn as two segments with a gap between them; the
# gap is where the label text for that event is placed.
done2b <-
  ggplot(data = subset(dct, year <= 1980),
         mapping = aes(x = year, y = cv1)) +
  geom_line(size = 1.5) +
  geom_point(size = 3) +
  scale_x_continuous(breaks = seq(1910, 2010, 20),
                     limits = c(1910, 2010)) +
  scale_y_continuous(breaks = seq(0, 800000, 200000),
                     limits = c(0, 825000),
                     labels = comma) +
  labs(x = "", y = "") +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_line(color = "gray"),
    axis.line.x = element_line(color = "black"),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text = element_text(size = 15),
    legend.position = "none"
  ) +
  # 1968 marker: lower and upper piece of the vertical line
  annotate(geom = "segment", x = 1968, xend = 1968,
           y = c(0, 550000), yend = c(450000, 825000),
           color = "#2b8cbe") +
  # 1954 marker: lower and upper piece of the vertical line
  annotate(geom = "segment", x = 1954, xend = 1954,
           y = c(0, 610000), yend = c(460000, 825000),
           color = "#74a9cf") +
  # 1954 label, one text item per line, right-aligned at x = 1955
  annotate(geom = "text", x = 1955, y = c(575000, 535000, 495000),
           label = c("1954:", "School", "Desegregation"),
           color = "#74a9cf", size = on.g.text.size, hjust = 1) +
  # 1968 label, one text item per line, left-aligned at x = 1967
  annotate(geom = "text", x = 1967, y = c(525000, 475000),
           label = c("1968:", "Civil Disturbance"),
           color = "#2b8cbe", size = on.g.text.size, hjust = 0)

# and then the whole thing
# Step 3 of 3: all years, with the 1954, 1968 and 1995 event markers.
# The data are supplied once, in ggplot(); the geoms inherit it, exactly as in
# done2a and done2b. (Previously dct was also passed positionally to each
# geom, which only worked because the named `mapping` argument pushed it into
# the `data` slot.)
done2c <- 
  ggplot(dct) +
  geom_line(mapping = aes(x=year, y=cv1), size=1.5) +
  geom_point(mapping = aes(x=year, y=cv1), size=3) +
  # same axis limits and breaks as the first two steps so the three graphs align
  scale_y_continuous(labels = comma,  
                     limits = c(0, 825000), 
                     breaks = c(seq(0,800000,200000))) +
  scale_x_continuous(limits= c(1910, 2010), 
                     breaks = c(seq(1910,2010,20))) +
  labs(x="", y="") +
  theme(panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        panel.background = element_blank(), 
        panel.grid.major.y = element_line(color="gray"),
        legend.position = "none",
        axis.line.x = element_line(color = "black"),
        axis.ticks.x = element_blank(), 
        axis.ticks.y = element_blank(),
        axis.text = element_text(size = 15)) +
  # each vertical marker is two segments with a gap left for its label text
  annotate(geom = "segment", x=1995, y=0, xend=1995, yend=220000, color="#045a8d") +
  annotate(geom = "segment", x=1995, y=360000, xend=1995, yend=825000, color="#045a8d") +
  annotate(geom = "segment", x=1968, y=0, xend=1968, yend=450000, color = "#2b8cbe") +
  annotate(geom = "segment", x=1968, y=550000, xend=1968, yend=825000, color = "#2b8cbe") +
  annotate(geom = "segment", x=1954, y=0, xend=1954, yend=460000, color = "#74a9cf") +
  annotate(geom = "segment", x=1954, y=610000, xend=1954, yend=825000, color = "#74a9cf") +
  # 1954 label (right-aligned)
  annotate(geom = "text", x=1955, y=575000, label="1954:", color = "#74a9cf", 
           size=on.g.text.size, hjust=1) +
  annotate(geom = "text", x=1955, y=535000, label="School", color = "#74a9cf", 
           size=on.g.text.size, hjust=1) +
  annotate(geom = "text", x=1955, y=495000, label="Desegregation", color = "#74a9cf", 
           size=on.g.text.size, hjust=1) +
  # 1968 label (left-aligned)
  annotate(geom = "text", x=1967, y=525000, label="1968:", color = "#2b8cbe", 
           size=on.g.text.size, hjust=0) +
  annotate(geom = "text", x=1967, y=475000, label="Civil Disturbance", color = "#2b8cbe", 
           size=on.g.text.size, hjust=0) +
  # 1995 label (left-aligned)
  annotate(geom = "text", x=1994, y=325000, label="1995:", color="#045a8d", 
           size=on.g.text.size, hjust=0) +
  annotate(geom = "text", x=1994, y=285000, label="Control Board", color="#045a8d", 
           size=on.g.text.size, hjust=0) +
  annotate(geom = "text", x=1994, y=245000, label="Takes Power", color="#045a8d", 
           size=on.g.text.size, hjust=0) 




  1. Using the bikeshare data,
  1. Re-do one of the by-hour pictures as a minute-by-minute picture showing total ridership
  2. Use one of the y variables we used or an alternative one. Add some annotations to your graph to point out salient features.


# load data 
# NOTE(review): the object is called cabi.201901 but the file read here is the
# 201902 trip data -- confirm which month is intended.
cabi.201901 <- read.csv("H:/pppa_data_viz/2019/tutorial_data/lecture08/201902-capitalbikeshare-tripdata/201902-capitalbikeshare-tripdata.csv")

# check out variables
# prepare time variables: parse the start/end timestamps into POSIXct
cabi.201901$time.start <- as.POSIXct(strptime(x = cabi.201901$Start.date, 
                                              format = "%Y-%m-%d %H:%M:%S"))
cabi.201901$time.stop  <- as.POSIXct(strptime(x = cabi.201901$End.date, 
                                              format = "%Y-%m-%d %H:%M:%S"))
# minute of the hour (0-59) in which each ride starts; the minute-level
# summary below groups on this variable, so it has to exist before group_by()
cabi.201901$start.minute <- as.integer(format(cabi.201901$time.start, "%M"))

# my duration calculation (a difftime: stop minus start)
cabi.201901$my.duration <- cabi.201901$time.stop - cabi.201901$time.start
# the file's own Duration column divided by 60
cabi.201901$Duration.minutes <- cabi.201901$Duration / 60
# make an indicator for a member: 1 if Member.type is "Member", 0 otherwise
cabi.201901$member <- ifelse(cabi.201901$Member.type == "Member", 1, 0)

# summarize to minute data: one row per value of start.minute
# The grouped copy gets its own name so the ride-level data frame is not
# silently replaced by a grouped tibble.
cabi.by.minute <- group_by(cabi.201901, start.minute)
cabisum <- summarize(.data = cabi.by.minute,
                     no_rides = n(),               # rides starting in that minute
                     mean_dur = mean(Duration),    # mean of the raw Duration column
                     member_rides = sum(member))   # rides with member == 1
# check the size of the summary
dim(cabisum)
# [1] 60  4
# find member share of rides
cabisum$member.share <- cabisum$member_rides / cabisum$no_rides

# number of rides by minute
# (no trailing `+` after the last layer: a dangling `+` would make R read the
# next statement as part of this plot expression)
c3 <- ggplot() +
  geom_line(data = cabisum, mapping = aes(x = start.minute, y = no_rides)) +
  labs(title = "Total number of rides by minute")

# member share of rides by minute
# (title now describes the plotted variable, member.share; the expression ends
# without a trailing `+` so it is a complete statement)
c4 <- ggplot() +
  geom_line(data = cabisum, mapping = aes(x = start.minute, y = member.share)) +
  labs(title = "Member share of rides by minute")

  1. More stacked areas

Now you try to load your own budget data!

Use Table 1.3 (hist01z3.xls), from which we want the year and columns E, F, G and columns I, J and K. Create a new Excel document with just this information, and make one row at the top with names that you’ll understand. Keep just through 2017, and make sure that you don’t have any junk at the bottom of the table. Save this file as csv (file, save as, choose “csv” option for file type).

Load it into R and make a stacked area graph of receipts, outlays and deficits over time.

Having done this myself, here are a few suggestions:

  • make long data, as we did above
  • make year numeric, as we did for the social insurance revenue above
  • get rid of commas in the data. My command to do this, for one variable, is
hist01z3$b1 <- as.numeric(gsub(",", "", hist01z3$cd.receipts, fixed = TRUE))


### receipts/surplus/deficits constant dollars #### 
hist01z3 <- read.csv("H:/pppa_data_viz/2018/tutorials/lecture05/omb_data/hist01z3.csv")

# look at the column names
names(hist01z3)
# [1] "year"        "cd.receipts" "cd.outlays"  "cd.surplus"  "pg.receipts"
# [6] "pg.outlays"  "pg.surplus" 
### need to make this long

# clean up variables for reshape
# Going through as.character() gives the actual year whether read.csv()
# returned year as a factor, a character or a number; the levels(x)[x] idiom
# only works for factors and yields NA otherwise.
hist01z3$nyear <- as.numeric(as.character(hist01z3$year))
# rename and make numeric for reshape
# warning: you also need to get rid of commas in the numbers
# see http://rfunction.com/archives/2354
hist01z3$b1 <- as.numeric(gsub(",", "", hist01z3$cd.receipts, fixed = TRUE))
hist01z3$b2 <- as.numeric(gsub(",", "", hist01z3$cd.outlays, fixed = TRUE))
hist01z3$b3 <- as.numeric(gsub(",", "", hist01z3$cd.surplus, fixed = TRUE))
# make a negative outlays for a more interesting chart
hist01z3$b4 <- hist01z3$b2 * -1

# keep only the year plus the four cleaned series that get reshaped
hist2 <- subset(hist01z3, select = c(year, b1, b2, b3, b4))

# wide -> long: one row per year and series
#   btype holds the source column name (b1 ... b4)
#   nyear holds the value of that series (despite its name, this is the
#         amount, not a year; the plot further down uses it as y)
b.long <- pivot_longer(
  data = hist2,
  cols = b1:b4,
  names_to = "btype",
  values_to = "nyear"
)
# readable label for each series code
# b2 (outlays) and b4 (outlays multiplied by -1) share the label "outlays"
type.labels <- c(b1 = "receipts",
                 b2 = "outlays",
                 b3 = "surplus",
                 b4 = "outlays")
b.long$bname <- unname(type.labels[b.long$btype])
# any code outside b1-b4 gets an empty label; a missing code stays NA
b.long$bname[!is.na(b.long$btype) & is.na(b.long$bname)] <- ""

# factor version of the label, used for grouping and fill in the chart
b.long$bname.fac <- factor(b.long$bname)

# check output
# keep only the rows for the series that are plotted: receipts (b1),
# surplus (b3) and negative outlays (b4). btype only ever contains the codes
# b1-b4, so column names do not belong in this list.
sub.long <- b.long[which(b.long$btype %in% c("b1", "b3", "b4")), ]

# make year numeric
# as.character() first: if year was read as a factor, as.numeric() alone would
# return the internal level codes instead of the years
sub.long$year <- as.numeric(as.character(sub.long$year))
### stacked area chart: receipts above zero, outlays below ###
# The group/fill variable has to be a factor (bname.fac); the chart does not
# work with the plain character label.
# scale_fill_manual() hands out its colours to the levels of bname.fac in
# level order.
hw5q2 <-
  ggplot(data = sub.long,
         mapping = aes(x = year, y = nyear,
                       group = bname.fac, fill = bname.fac)) +
  geom_area() +
  scale_fill_manual(values = c("red", "black", "grey")) +
  theme(
    panel.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_line(color = "gray"),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "none"
  ) +
  # the legend is switched off, so the areas are labelled directly on the graph
  annotate("text", x = 1960, y = c(400, -350),
           label = c("receipts", "outlays"), color = "white") +
  annotate("text", x = 2012.2, y = -130, label = "difference",
           color = "black", size = 4.5)

