## Baby names
##
## Exploratory script over the `babynames` data set. It uses the columns
## `year`, `sex`, `name`, `n` and `prop`, and adds `firstletter` and
## `lastletter` to `babies` further down. Every top-level expression that is
## not assigned is meant to be printed / plotted when the script is run.
library(babynames)
library(dplyr)
library(ggplot2)

babies <- babynames
head(babies)
tail(babies)
dim(babies)

# Plot the popularity (`prop`) of one name over `year`, one line per `sex`.
#
# name_data: rows of `babies` for a single name (needs year, prop, sex).
# title:     plot title, normally the name itself.
# Returns the ggplot object, so a top-level call prints the plot.
#
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line geoms in favour
# of `linewidth`; `size` is kept throughout this script so it also runs on
# older ggplot2 versions. Confirm the installed version before switching.
plot_name_popularity <- function(name_data, title) {
  ggplot(name_data, aes(x = year, y = prop, group = sex, color = sex)) +
    geom_line(size = 2) +
    ggtitle(title)
}

#########################################
#########################################
############ Compare 4 names ############
#########################################
#########################################

## Choose "firstchoice" name
name1 <- "Bradley"
firstchoice <- babies %>%
  filter(name == name1)

# Plot of name popularity (proportion of kids)
plot_name_popularity(firstchoice, name1)

# Choose "secondchoice" name
name2 <- "Trinity"
secondchoice <- babies %>%
  filter(name == name2)

# Plot of name popularity (proportion of kids)
plot_name_popularity(secondchoice, name2)

# Choose "thirdchoice" name
name3 <- "Elsa"
thirdchoice <- babies %>%
  filter(name == name3)

# Plot of name popularity (proportion of kids)
plot_name_popularity(thirdchoice, name3)

# Choose "fourthchoice" name
name4 <- "Brad"
fourthchoice <- babies %>%
  filter(name == name4)

# Compare first vs second vs third vs fourth.
# Each name is restricted to a single sex so that every name contributes
# exactly one line to the plot below. `&` already binds tighter than `|`;
# the parentheses only make that grouping explicit.
comparison <- subset(
  babies,
  (name == name1 & sex == "M") |
    (name == name2 & sex == "F") |
    (name == name3 & sex == "F") |
    (name == name4 & sex == "M")
)

# Quick line plot of name popularity
ggplot(comparison, aes(x = year, y = prop, group = name, color = name)) +
  geom_line(size = 2)

# Results: total count and total proportion in 2012, summed over both sexes
firstchoice %>%
  filter(year == 2012) %>%
  summarize(names = sum(n), prop = sum(prop))

secondchoice %>%
  filter(year == 2012) %>%
  summarize(names = sum(n), prop = sum(prop))

thirdchoice %>%
  filter(year == 2012) %>%
  summarize(names = sum(n), prop = sum(prop))

fourthchoice %>%
  filter(year == 2012) %>%
  summarize(names = sum(n), prop = sum(prop))

#########################################
#########################################
########### # of unique names ###########
#########################################
#########################################

## Are there more unique names now?
babies %>%
  group_by(year, sex) %>%
  summarize(number_of_names = n_distinct(name)) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = number_of_names, group = sex, color = sex)) +
  geom_line(size = 2) +
  ggtitle("Number of unique names")

#########################################
#########################################
######### First or last letters #########
#########################################
#########################################

## Get first and last letters of each name
babies$firstletter <- substr(babies$name, 1, 1)
# The last letter is the one-character substring at position nchar(name).
babies$lastletter <- substr(
  babies$name,
  nchar(babies$name),
  nchar(babies$name)
)

# Show random sample of data
babies[sample(nrow(babies), 10), ]

## Popularity of first letter (males)
# Replace the uppercase letters
firstchoice1 <- "Z"
firstchoice2 <- "X"
firstchoice3 <- "Q"
firstchoice4 <- "Y"

# Plot comparison: babies per year for each chosen first letter
firstmale <- babies %>%
  filter(sex == "M") %>%
  filter(
    firstletter %in% c(firstchoice1, firstchoice2, firstchoice3, firstchoice4)
  ) %>%
  group_by(year, firstletter) %>%
  summarize(sum = sum(n)) %>%
  ungroup()

ggplot(
  firstmale,
  aes(x = year, y = sum, group = firstletter, color = firstletter)
) +
  geom_line(size = 1)

## Popularity of first letter (females)
# Replace the uppercase letters
firstchoice1 <- "Z"
firstchoice2 <- "X"
firstchoice3 <- "Q"
firstchoice4 <- "Y"

# Plot comparison: babies per year for each chosen first letter
firstfemale <- babies %>%
  filter(sex == "F") %>%
  filter(
    firstletter %in% c(firstchoice1, firstchoice2, firstchoice3, firstchoice4)
  ) %>%
  group_by(year, firstletter) %>%
  summarize(sum = sum(n)) %>%
  ungroup()

ggplot(
  firstfemale,
  aes(x = year, y = sum, group = firstletter, color = firstletter)
) +
  geom_line(size = 1)

## Popularity of last letter (males)
# Replace the lowercase letters
lastchoice1 <- "z"
lastchoice2 <- "x"
lastchoice3 <- "q"
lastchoice4 <- "f"

# Plot comparison: babies per year for each chosen last letter
lastmale <- babies %>%
  filter(sex == "M") %>%
  filter(
    lastletter %in% c(lastchoice1, lastchoice2, lastchoice3, lastchoice4)
  ) %>%
  group_by(year, lastletter) %>%
  summarize(sum = sum(n)) %>%
  ungroup()

ggplot(
  lastmale,
  aes(x = year, y = sum, group = lastletter, color = lastletter)
) +
  geom_line(size = 1)

## Popularity of last letter (females)
# Replace the lowercase letters
lastchoice1 <- "z"
lastchoice2 <- "x"
lastchoice3 <- "q"
lastchoice4 <- "f"

# Plot comparison: babies per year for each chosen last letter
lastfemale <- babies %>%
  filter(sex == "F") %>%
  filter(
    lastletter %in% c(lastchoice1, lastchoice2, lastchoice3, lastchoice4)
  ) %>%
  group_by(year, lastletter) %>%
  summarize(sum = sum(n)) %>%
  ungroup()

ggplot(
  lastfemale,
  aes(x = year, y = sum, group = lastletter, color = lastletter)
) +
  geom_line(size = 1)

#########################################
#########################################
########### Most popular name ###########
#########################################
#########################################

# Pick the most popular name (boy or girl, but pick gender)
namechoice1 <- "Quin"
genderchoice1 <- "M"
namechoice2 <- "Quinn"
genderchoice2 <- "F"
namechoice3 <- "Queen"
genderchoice3 <- "F"
namechoice4 <- "Queenie"
genderchoice4 <- "F"
namechoice5 <- "Quentin"
genderchoice5 <- "M"
namechoice6 <- "Quincy"
genderchoice6 <- "M"

# Keep only the rows matching each (name, gender) pair chosen above.
namechoices <- subset(
  babies,
  (name == namechoice1 & sex == genderchoice1) |
    (name == namechoice2 & sex == genderchoice2) |
    (name == namechoice3 & sex == genderchoice3) |
    (name == namechoice4 & sex == genderchoice4) |
    (name == namechoice5 & sex == genderchoice5) |
    (name == namechoice6 & sex == genderchoice6)
)

# Plot proportion of people
ggplot(namechoices, aes(x = year, y = prop, group = name, color = name)) +
  geom_line(size = 1)