############################################################################

# Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams)
#
# By:   Wolfgang Viechtbauer (https://www.wvbauer.com)
# Date: 2020-12-03
#
# Topic(s):
# - a look at the tidyverse (dplyr, magrittr, tibble, etc.)
#
# last updated: 2020-12-10

############################################################################

# some potentially useful links:
# https://education.rstudio.com
# https://www.tidyverse.org
# https://psyr.djnavarro.net
# https://moderndive.com/
# https://r4ds.had.co.nz/
# https://www.datacamp.com/community/tutorials/pipe-r-tutorial

############################################################################

# install the tidyverse
#install.packages("tidyverse")

# load the tidyverse
library(tidyverse)

# set working directory (adjust to your own computer; in RStudio, go to the
# Session menu, 'Set Working Directory', and 'To Source File Location')
setwd("~/temp")

# read in data (using the 'base R' way); na.strings="" makes empty fields
# missing values
dat <- read.delim("data_survey_edit.txt", na.strings="")

# look at first 6 rows of dat
head(dat)

# figure out the class of 'dat'
class(dat)

# note: 'dat' is a data frame

# read in data (using the 'tidyverse' way); this overwrites the data frame
# created above, so everything below works on the tibble
dat <- read_delim("data_survey_edit.txt", delim="\t")

# note: this uses readr (https://readr.tidyverse.org)

# look at first 6 rows of dat
head(dat)

# figure out the class of 'dat'
class(dat)

# note: 'dat' is a tibble (https://tibble.tidyverse.org)

# basic idea of the difference between nested functions and piping
sqrt(log(5))
5 %>% log %>% sqrt

# note: the pipe operator comes from magrittr (https://magrittr.tidyverse.org)

# select a variable (note: this returns a tibble / data.frame)
dat %>% select(pss)

# pull out a variable (note: this returns a vector)
dat %>% pull(pss)

# could also do it this way, but this is unnecessarily complex
dat %>% select(pss) %>% pull()

# pull out a variable and take the mean of it
dat %>% pull(pss) %>% mean(na.rm=TRUE)

# the 'base R' way of doing the same thing
mean(dat$pss, na.rm=TRUE)

# compute the mean of the male and female subjects
dat %>% filter(sex == "male") %>% pull(pss) %>% mean(na.rm=TRUE)
dat %>% filter(sex == "female") %>% pull(pss) %>% mean(na.rm=TRUE)

# the 'base R' way of doing the same thing
mean(dat$pss[dat$sex == "male"], na.rm=TRUE)
mean(dat$pss[dat$sex == "female"], na.rm=TRUE)

# another way using the subset() function (drop = TRUE so that a vector and
# not a one-column table is passed to mean())
mean(subset(dat, sex == "male", select = pss, drop = TRUE), na.rm=TRUE)
mean(subset(dat, sex == "female", select = pss, drop = TRUE), na.rm=TRUE)

# yet another way
by(dat$pss, dat$sex, mean, na.rm=TRUE)

# the tidyverse approach to the by() example
dat %>% group_by(sex) %>% summarise(meanpss = mean(pss, na.rm=TRUE))

# the result is a tibble where values are printed with lots of rounding;
# could pipe this further to as.data.frame() to turn it into a plain old
# data frame where these numbers are printed with less rounding
dat %>% group_by(sex) %>% summarise(meanpss = mean(pss, na.rm=TRUE)) %>% as.data.frame()

# if we just need the means directly, can pull them out again with pull()
dat %>% group_by(sex) %>% summarise(meanpss = mean(pss, na.rm=TRUE)) %>% pull(meanpss)

# to get rid of that message with respect to ungrouping (via .groups="drop")
# and put each step on its own line
dat %>%
   group_by(sex) %>%
   summarise(meanpss = mean(pss, na.rm=TRUE), .groups="drop") %>%
   pull(meanpss)

# remove the 'pss' variable from dat
dat$pss <- NULL

# compute the sum of the 10 PSS variables (the 'base R' way); since 'pss'
# itself was removed above, grep() only picks up the remaining variables with
# 'pss' in their name; note that rowSums() gives NA for any row that has a
# missing value in one of these variables
#dat$pss <- dat$pss1 + dat$pss2 + ...
#dat$pss <- with(dat, pss1 + pss2 + ...)
dat$pss <- rowSums(dat[grep("pss", names(dat))])

# remove 'pss' variable from dat (the tidyverse way)
dat <- dat %>% select(-pss)

# compute the sum of the pss variables and add it to the tibble as a new
# variable (inside mutate(), '.' is the tibble of selected pss variables)
dat$pss <- dat %>%
   select(starts_with("pss")) %>%
   mutate(pss = rowSums(.)) %>%
   pull(pss)

# remove 'pss' variable again
dat <- dat %>% select(-pss)

# could also do it this way, but now we again have multiple nested functions
dat <- dat %>% mutate(pss = rowSums(select(., starts_with("pss"))))

# make a copy of dat
tmp <- dat

# look at first 6 rows of tmp
head(tmp)

# reorder data frame by increasing age (the 'base R' way)
tmp <- tmp[order(tmp$age),]

# look at first 6 rows of tmp
head(tmp)

# make a copy of dat
tmp <- dat

# look at first 6 rows of tmp
head(tmp)

# reorder data frame by increasing age the tidyverse way
tmp <- arrange(tmp, age)

# look at first 6 rows of tmp
head(tmp)

# or with piping
tmp <- dat
tmp <- tmp %>% arrange(age)
head(tmp)

############################################################################