--- title: "MATH 216 Homework 3" author: "WRITE YOUR NAME HERE" output: html_document: toc: true toc_float: true collapsed: false smooth_scroll: false --- ```{r, echo=FALSE, message=FALSE} library(ggplot2) library(dplyr) library(readr) library(lubridate) library(Quandl) library(knitr) ``` ## Question 1: Who | Variables | Threshold | Test % Correct ------------- | ------------- | ------------- | ------------- Alden | `job2 + height + diet3 + income2` | 0.5 | 84.1 Amanda | `income_level + job_new + body_type_buckets` | 0.5 | 70.7 Brenda | `income+height` | 0.5 | 83.2 James | `income + job + orientation + body_type` | 0.5 | 71.4 Albert | `height`| 0.5 | 83.0 Recall from `Lec12.R` and quiz ```{r, echo=FALSE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5} profiles <- read_csv(file="profiles.csv") %>% filter(!is.na(height)) %>% mutate(is_female=ifelse(sex=='f', 1, 0)) %>% tibble::rownames_to_column(var="id") profiles_clean <- profiles %>% filter(!is.na(height)) %>% filter(between(height, 55, 80)) prop_female_height <- profiles_clean %>% select(is_female, height) %>% group_by(height) %>% summarise(prop_female = mean(is_female)) # Plot ggplot(data=prop_female_height, aes(height, prop_female)) + geom_line() + geom_hline(yintercept = 0.5, col="red") + labs(y="Proportion Female") ``` ## Question 2: **Important Principle of Coding**: [Don't Repeat Yourself](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) ```{r, echo=TRUE, message=FALSE, warning=FALSE, fig.width=8, fig.height=4.5} # 1. Get gold & bitcoin data # 2. Make column structure the same # 3. Add variable type gold <- Quandl("BUNDESBANK/BBK01_WT5511") %>% select(Date, Value) %>% mutate(type="Gold") bitcoin <- Quandl("BAVERAGE/USD") %>% rename(Value = `24h Average`) %>% select(Date, Value) %>% mutate(type="Bitcoin") # Combine them into single data frame using bind_rows() combined <- bind_rows(gold, bitcoin) %>% # Group by here! group_by(type) %>% # Then do the following ONLY ONCE: filter(year(Date) >= 2011) %>% arrange(Date) %>% mutate( Value_yest = lag(Value), rel_diff = 100 * (Value-Value_yest)/Value_yest ) # Plot ggplot(combined, aes(x=Date, y=rel_diff, col=type)) + geom_line() + labs(y="% Change") ``` ## Question 3: ```{r, echo=FALSE, message=FALSE, cache=TRUE} # Edit this code block at your own peril! cache is set to TRUE! jukebox <- read_csv(file="reed_jukebox.csv") # Clean certain artists' names: sigur_ros <- "Sigur Ro\xfc\xbe\x99\x86\x94\xbc\xfc\xbe\x8c\x93\xa0\xbcs" bjork <- "Bjo\xfc\xbe\x99\x86\x94\xbc\xfc\xbe\x8d\xa6\x98\xbcrk" blue_oyster_cult <- "Blue O\xfc\xbe\x99\x86\x94\xbc\xfc\xbe\x8d\xa6\x98\xbcyster Cult" husker_do <- "Hu\xfc\xbe\x99\x86\x94\xbc\xfc\xbe\x8d\xa6\x98\xbcsker Du\xfc\xbe\x99\x86\x94\xbc\xfc\xbe\x8d\xa6\x98\xbc" bjork_brodsky <- "Bjo\xfc\xbe\x99\x86\x94\xbc\xfc\xbe\x8d\xa6\x98\xbcrk & Brodsky Quartet" slagsmalsklubben <- "Slagsma\xfc\xbe\x99\x86\x94\xbc_lsklubben " bjork_sugarcubes <- "Bjo\xfc\xbe\x99\x86\x94\xbc\xfc\xbe\x8d\xa6\x98\xbcrk (Sugarcubes)" jukebox <- jukebox %>% mutate( artist = ifelse(artist == sigur_ros, "Sigor Ros", artist), artist = ifelse(artist == bjork, "Bjork", artist), artist = ifelse(artist == blue_oyster_cult, "Blue Oyster Cult", artist), artist = ifelse(artist == husker_do, "Husker Do", artist), artist = ifelse(artist == bjork_brodsky, "Bjork & Brodsky Quartet", artist), artist = ifelse(artist == slagsmalsklubben, "Slagsmalsklubben", artist), artist = ifelse(artist == bjork_sugarcubes, "Bjork (Sugarcubes)", artist) ) ``` When parsing the time, what most of you did: ```{r, echo=TRUE, message=TRUE, warning=TRUE, fig.width=8, fig.height=4.5} jukebox_hourly <- jukebox %>% mutate( date_time = parse_date_time(date_time, "%b %d %H%M%S %Y"), hour=hour(date_time) ) %>% group_by(hour) %>% summarise(count=n()) ``` What's wrong with this plot? ```{r, echo=TRUE, message=TRUE, warning=TRUE, fig.width=8, fig.height=4.5} ggplot(data=jukebox_hourly, aes(x=hour, y=count)) + geom_bar(stat="identity") + xlab("Hour of day") + ylab("Number of songs played") ``` Need to change the time zone using `date_time = with_tz(date_time, tz = "America/Los_Angeles")`: ```{r, echo=TRUE, message=TRUE, warning=TRUE, fig.width=8, fig.height=4.5} jukebox_hourly <- jukebox %>% mutate( date_time = parse_date_time(date_time, "%b %d %H%M%S %Y"), date_time = with_tz(date_time, tz = "America/Los_Angeles"), hour=hour(date_time) ) %>% group_by(hour) %>% summarise(count=n()) ggplot(data=jukebox_hourly, aes(x=hour, y=count)) + geom_bar(stat="identity") + xlab("Hour of day") + ylab("Number of songs played") ``` Your top 10 graveyard shift artists are not ```{r, echo=FALSE} top10_jukebox <- jukebox %>% # Cleaning the date variable mutate( clean_date = parse_date_time(date_time, "a b d HMS Y") ) %>% select(artist, clean_date) %>% # Filtering out months not in the academic year filter(month(clean_date) >= 9 | month(clean_date) <= 5) %>% # Filtering out times not in the graveyard shift filter(hour(clean_date) < 8 | hour(clean_date) == 8 & minute(clean_date) == 0 & second(clean_date) == 0) %>% # Group by artist and count group_by(artist) %>% count() %>% arrange(desc(n)) # Generating Top 10 Table: head(top10_jukebox, 10) %>% kable() ``` but are instead below. Not that different actually! ```{r, echo=FALSE} top10_jukebox <- jukebox %>% # Cleaning the date variable mutate( clean_date = parse_date_time(date_time, "a b d HMS Y"), clean_date = with_tz(clean_date , tz = "America/Los_Angeles") ) %>% select(artist, clean_date) %>% # Filtering out months not in the academic year filter(month(clean_date) >= 9 | month(clean_date) <= 5) %>% # Filtering out times not in the graveyard shift filter(hour(clean_date) < 8 | hour(clean_date) == 8 & minute(clean_date) == 0 & second(clean_date) == 0) %>% # Group by artist and count group_by(artist) %>% count() %>% arrange(desc(n)) # Generating Top 10 Table: head(top10_jukebox, 10) %>% kable() ```