############################################################################
# Course: Introduction to R
# Author: Wolfgang Viechtbauer (https://www.wvbauer.com)
# License: CC BY-NC-SA 4.0
#
# last updated: 2024-02-08
############################################################################

# restart the R session (Menu 'Session' - 'Restart R')

# read in the code from rcode_helper.r

source("rcode_helper.r")

############################################################################

# load the data

load("data_survey_edit.rdata")

# inspect the first 6 rows of the dataset

head(dat)

# open the codebook pdf via this script

browseURL("data_survey.pdf")

############################################################################

# attach some description to variables (variable labels)

# a simple solution: using comment()

comment(dat$source) <- "Primary source of stress."
comment(dat$lotr1)  <- "In uncertain times, I usually expect the best."
comment(dat$lotr2)  <- "If something can go wrong for me, it will."
comment(dat$lotr3)  <- "I'm always optimistic about my future."
comment(dat$lotr4)  <- "I hardly ever expect things to go my way."
comment(dat$lotr5)  <- "I rarely count on good things happening to me."
comment(dat$lotr6)  <- "Overall, I expect more good things to happen to me than bad."

# labels do not show up when looking at the dataset

head(dat)

# but we can extract them

comment(dat$source)

# or get all of them

sapply(dat, comment)

# only show labels for variables that have one

res <- sapply(dat, comment)
res <- res[!sapply(res, is.null)]
res

# print as a data frame

res <- data.frame(label=unlist(res))
print(res, right=FALSE)

# there are packages specifically for creating/extracting labels; for example:
#
# https://cran.r-project.org/package=tinylabels
# https://cran.r-project.org/package=labelled
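# as a small illustration of the 'labelled' package (a sketch, assuming the
# package can be installed via loadpkg()): var_label() can both set and
# retrieve such labels

loadpkg(labelled)

var_label(dat$source) <- "Primary source of stress."
var_label(dat$source)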
############################################################################

# psychometrics
# https://en.wikipedia.org/wiki/Psychometrics

head(dat)

# install (if necessary) the 'psych' package and load it
loadpkg(psych)

# install (if necessary) the 'GPArotation' package and load it
loadpkg(GPArotation)

# Cronbach's alpha
# https://en.wikipedia.org/wiki/Cronbach's_alpha

# say we want to compute Cronbach's alpha for the lotr scale; then we just
# need to pass the data for the 6 items of this scale to the alpha() function
# from the psych package

alpha(dat[c("lotr1", "lotr2", "lotr3", "lotr4", "lotr5", "lotr6")])

# but having to specify the variables (especially if there are many) in this
# way is tedious; in rcode04.r, we used the grep() function to find the
# position of variables in a dataset

names(dat)
grep("lotr", names(dat))

# careful: variable 68 is the sum score of the 6 items, which we do not want
# to include in the computation of Cronbach's alpha; so we really need to
# search for variable names that include 'lotr' followed by a number; grep()
# actually searches for a 'regular expression', which allows for the
# specification of very complex search patterns; for more details, see:
# https://en.wikipedia.org/wiki/Regular_expression

# can also read more about regular expressions as used in R
help(regexp)

# this will search for 'lotr' followed by a number between 0 and 9
grep("lotr[0-9]", names(dat))

# and now we can do

alpha(dat[grep("lotr[0-9]", names(dat))])

# use the same trick to compute Cronbach's alpha for other scales

alpha(dat[grep("mastery[0-9]", names(dat))])
alpha(dat[grep("pss[0-9]", names(dat))])
alpha(dat[grep("rses[0-9]", names(dat))])

# for the PANAS, we might have to do this manually

alpha(dat[c("panas1", "panas4", "panas6", "panas7", "panas9", "panas12",
            "panas13", "panas15", "panas17", "panas18")])
alpha(dat[c("panas2", "panas3", "panas5", "panas8", "panas10", "panas11",
            "panas14", "panas16", "panas19", "panas20")])

# or could do something like this

alpha(dat[paste0("panas", c(1,4,6,7,9,12,13,15,17,18))])
alpha(dat[paste0("panas", c(2,3,5,8,10,11,14,16,19,20))])

# scree plot
# https://en.wikipedia.org/wiki/Scree_plot

sub <- dat[grep("pss[0-9]", names(dat))]
scree(sub, factors=FALSE)

# parallel analysis
# https://en.wikipedia.org/wiki/Parallel_analysis

fa.parallel(sub, fa="fa", n.iter=1000, sim=FALSE)

# principal component analysis (PCA)
# https://en.wikipedia.org/wiki/Principal_component_analysis

principal(sub, nfactors=2, rotate="oblimin")

# exploratory factor analysis (EFA) using principal axis factoring (PAF)
# https://en.wikipedia.org/wiki/Exploratory_factor_analysis

fa(sub, nfactors=2, rotate="oblimin", fm="pa")

############################################################################

# the 'psych' package has a bunch of other useful functions; for example, the
# describe() function provides summary statistics for all quantitative
# variables in a dataset

describe(dat, omit=TRUE)

# the 'skimr' package also creates nice summaries of datasets

# install (if necessary) the 'skimr' package and load it
loadpkg(skimr)

skim(dat)
skim_without_charts(dat) # if there are problems with drawing the mini histograms

# note: for character variables, the information provided is not so useful;
# it helps to turn such variables into factors

dat$sex <- factor(dat$sex)
skim(dat)

############################################################################

# some built-in color palettes

rm(list=ls())

load("data_survey_edit.rdata")

par(mar=c(5,9,4,2))

boxplot(pss ~ marital, data=dat, col=rainbow(8), xlab="PSS", ylab="",
        pch=19, horizontal=TRUE, las=1,
        main="Perceived Stress by Marital Status", boxwex=0.6)

# some of the built-in color palettes

palette.pals()

# plot 8 colors from each palette (and from rainbow)

pals  <- palette.pals()
npals <- length(pals)
cols  <- 8

par(mar=c(2,8,2,2), las=1)
plot(NA, NA, xlim=c(1,8), ylim=c(1,npals+1), xlab="", ylab="", xaxt="n", yaxt="n")
axis(side=2, at=1:(npals+1), labels=c(pals, "Rainbow"))
for (i in 1:npals) {
   points(1:cols, rep(i,cols), pch=19, cex=4, col=palette.colors(cols, palette=pals[i]))
}
points(1:cols, rep(i+1,cols), pch=19, cex=4, col=rainbow(cols))

# now pick one you like

par(mar=c(5,9,4,2))

boxplot(pss ~ marital, data=dat, col=palette.colors(8, palette="Set 2"),
        xlab="PSS", ylab="", pch=19, horizontal=TRUE, las=1,
        main="Perceived Stress by Marital Status", boxwex=0.6)

# palette.colors() is also useful if you quickly need a number of colors for
# points or lines corresponding to different groups (e.g., in a scatterplot)
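# for example, a small sketch that colors the points of a scatterplot by
# marital status (using the default 'Okabe-Ito' palette)

grpcols <- palette.colors(nlevels(factor(dat$marital)))
plot(dat$pss, dat$posaff, pch=19, col=grpcols[factor(dat$marital)],
     xlab="PSS", ylab="Positive Affect")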
# there are even more palettes

hcl.pals()

############################################################################

# just in case, clear the workspace

rm(list=ls())

# merge two datasets by a common id

dat1 <- data.frame(id  = c(1, 3, 4, 5, 7),
                   age = c(30, 34, 28, 21, 29),
                   sex = c("f", "m", "m", "f", "m"))
dat2 <- data.frame(id  = c(1, 2, 4, 5, 6),
                   pss = c(29, 22, 19, 31, 27))
dat1
dat2

dat <- merge(dat1, dat2, by="id")
dat

dat <- merge(dat1, dat2, by="id", all.x=TRUE)
dat

dat <- merge(dat1, dat2, by="id", all=TRUE)
dat

# a more complex situation where dat2 includes (for some subjects) repeated
# measurements of a particular variable

dat2 <- data.frame(id   = c(1,1,1,2,3,3,4,5,6,6),
                   time = c(1,2,3,1,1,3,1,1,1,2),
                   pss  = c(30,27,22,25,36,22,33,29,39,32))
dat2

dat <- merge(dat1, dat2, all=TRUE, sort=FALSE)
dat

dat <- dat[order(dat$id, dat$time),]
rownames(dat) <- NULL
dat

# combining two datasets containing different subjects (with some common
# variables and possibly with some different variables)

dat1 <- data.frame(id = c(1,2,3), pss = c(30,27,32), rses=c(34,NA,37))
dat2 <- data.frame(id = c(4,5), pss = c(22,18), smoker=c("no","yes"))
dat1
dat2

# this does not work since the variables in the two datasets are not identical

dat <- rbind(dat1, dat2)

# create variables with NAs for variables not present in a dataset

dat1[setdiff(names(dat2), names(dat1))] <- NA
dat2[setdiff(names(dat1), names(dat2))] <- NA
dat1
dat2

# now bind the rows together (note that the position of the variables does not
# matter, since rbind() correctly figures out how to match things up)

dat <- rbind(dat1, dat2)
dat

# note: dplyr::bind_rows(dat1, dat2) also works without having to first create
# the variables with NAs
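# for example (a sketch, assuming the 'dplyr' package can be installed); note
# that dat1 and dat2 already contain the extra NA columns at this point, so
# this just illustrates the call

loadpkg(dplyr)

bind_rows(dat1, dat2)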
############################################################################

# restructure a dataset from wide to long format

dat.wide <- data.frame(subj = 1:5,
                       y1 = c(5,3,6,7,3),
                       y2 = c(4,NA,6,5,4),
                       y3 = c(2,3,4,4,1))
dat.wide

dat.long <- reshape(dat.wide, direction="long", varying=list(2:4),
                    v.names="y", timevar="timepoint", idvar="id")
dat.long

dat.long <- dat.long[order(dat.long$id),]
dat.long

dat.long$id <- NULL
rownames(dat.long) <- NULL
dat.long

# restructure a dataset from long to wide format

dat <- reshape(dat.long, direction="wide", idvar="subj",
               timevar="timepoint", v.names="y", sep="")
dat

rownames(dat) <- NULL
dat

# the reshape package (https://cran.r-project.org/package=reshape) is also
# supposed to be very good for this kind of restructuring

############################################################################

# per-group operations in long format datasets

dat.long

# get the mean of y for each subject (note: this automatically handles NAs)

aggregate(y ~ subj, data=dat.long, FUN=mean)

# this essentially does the same as using by(), but by() returns a 'by' object
# (which we can turn into a vector with c()), while aggregate() returns a data frame

by(dat.long$y, dat.long$subj, mean)
c(by(dat.long$y, dat.long$subj, mean, na.rm=TRUE))

# add the mean of y for each subject to the dataset

dat.long$ym <- ave(dat.long$y, dat.long$subj, FUN=mean)
dat.long

# unfortunately, cannot use na.rm=TRUE with ave(), so we have to write our own
# little function to compute the mean with na.rm set to TRUE automatically

dat.long$ym <- ave(dat.long$y, dat.long$subj, FUN = function(x) mean(x, na.rm=TRUE))
dat.long

############################################################################

# dichotomizing / categorizing / collapsing

rm(list=ls())

# load the data

load("data_survey_edit.rdata")

# dichotomize a quantitative variable ('median split')

dat$highpa <- ifelse(dat$posaff > median(dat$posaff), 1, 0)
head(dat)
table(dat$highpa)
table(dat$posaff, dat$highpa)

# categorize a quantitative variable

dat$pss
table(dat$pss)

dat$pss.lvl <- ifelse(dat$pss > 25, 1, 0)
table(dat$pss.lvl)

dat$pss.lvl <- cut(dat$pss, breaks=c(0, 20, 30, 40, 50))
table(dat$pss.lvl) # (0,20] means 'just above 0 to 20 inclusive'

dat$pss.lvl <- cut(dat$pss, breaks=c(0, 20, 30, 40, 50), labels=1:4)
table(dat$pss.lvl)

dat$pss.lvl <- cut(dat$pss, breaks=c(0, 20, 30, 40, 50),
                   labels=c("low", "medium", "high", "very high"))
table(dat$pss.lvl)

# collapse some levels of a categorical variable

dat$stress <- dat$source
table(dat$stress, useNA="always")

dat$stress <- ifelse(dat$stress == "children", "interpers", dat$stress)
dat$stress <- ifelse(dat$stress == "family", "interpers", dat$stress)
dat$stress <- ifelse(dat$stress == "friendships", "interpers", dat$stress)
dat$stress <- ifelse(dat$stress == "spouse/partner", "interpers", dat$stress)
table(dat$stress, useNA="always")

# a shorter way

dat$stress <- NULL

dat$stress <- ifelse(dat$source %in% c("children", "family", "friendships",
                                       "spouse/partner"), "interpers", dat$source)
table(dat$stress, useNA="always")

# collapse it further

dat$stress <- ifelse(dat$stress %in% c("health/illness", "lack of time",
                                       "life in general", "money/finances"),
                     "other", dat$stress)
table(dat$stress, useNA="always")

############################################################################

rm(list=ls())

# change more than 2 categories into color names

dat <- data.frame(x = c("grp1", "grp2", "grp2", "grp3", NA, "grp1"))
dat

# using nested if-else statements

ifelse(dat$x == "grp1", "red", ifelse(dat$x == "grp2", "blue", "green"))

# using as.numeric(factor())

c("red", "blue", "green")[as.numeric(factor(dat$x, levels=c("grp1", "grp2", "grp3")))]

# can even leave out the as.numeric()

c("red", "blue", "green")[factor(dat$x, levels=c("grp1", "grp2", "grp3"))]

# can even leave out the 'levels' part

c("red", "blue", "green")[factor(dat$x)]

# using with() together with {} and base code

with(dat, {
   cols <- NA
   cols[x == "grp1"] <- "red"
   cols[x == "grp2"] <- "blue"
   cols[x == "grp3"] <- "green"
   cols
})

# this also works nicely when multiple groups should receive the same color name

with(dat, {
   cols <- NA
   cols[x %in% c("grp1", "grp2")] <- "red"
   cols[x %in% "grp3"] <- "green"
   cols
})

# using switch() with sapply()

sapply(dat$x, switch,
       grp1 = "red",
       grp2 = "blue",
       grp3 = "green",
       NA)

# using car::recode()

loadpkg(car)

recode(dat$x, "
   'grp1' = 'red';
   'grp2' = 'blue';
   'grp3' = 'green'
")

# using dplyr::case_when()

loadpkg(dplyr)

with(dat, case_when(
   x == "grp1" ~ "red",
   x == "grp2" ~ "blue",
   x == "grp3" ~ "green"
))

# let's put this into practice with a real example

loadpkg(palmerpenguins)

# https://allisonhorst.github.io/palmerpenguins/

head(penguins)

# notes:
# - the dataset is a tibble
# - species is already a factor

penguins$species

plot(bill_length_mm ~ flipper_length_mm, data=penguins,
     xlab="Flipper length (mm)", ylab="Bill length (mm)",
     bty="l", pch=19, col=c("darkorange","purple","cyan4")[species])

# so Adelie -> darkorange, Chinstrap -> purple, Gentoo -> cyan4

# same idea also works for using different plotting symbols for the groups

plot(bill_length_mm ~ flipper_length_mm, data=penguins, pch=NA,
     xlab="Flipper length (mm)", ylab="Bill length (mm)", bty="l")
grid()
points(bill_length_mm ~ flipper_length_mm, data=penguins,
       pch=c(19,17,15)[species], col=c("darkorange","purple","cyan4")[species])

# note: we do not have to use penguins$species when using the 'data' argument

legend("bottomright", pch=c(19,17,15), col=c("darkorange","purple","cyan4"),
       legend=c("Adelie","Chinstrap","Gentoo"), bty="n", title="Penguin species")

############################################################################

# string manipulation

# pasting together things as strings

paste("R", "is so awesome!")

paste("Mean flipper length:", round(mean(penguins$flipper_length_mm, na.rm=TRUE), 2))

# say the 'id' variable includes the subject initials and their year of birth

id <- c("DB1965", "PL1967", "ES1975")
id

# want to extract the year values

substr(id, start=3, stop=6)
as.numeric(substr(id, start=3, stop=6))

# but say that some subjects have more than two initials

id <- c("DB1965", "PLK1967", "ES1975")
id

# then the code above doesn't work

as.numeric(substr(id, start=3, stop=6))

# check the string lengths

nchar(id)

# assuming that the last 4 characters always give the year, we can use the
# string length to determine where we need to start extracting substrings

nchar(id)-3

substr(id, start=nchar(id)-3, stop=nchar(id))
as.numeric(substr(id, start=nchar(id)-3, stop=nchar(id)))

# some packages for string manipulations:
# https://cran.r-project.org/package=stringi
# https://cran.r-project.org/package=stringr
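# as an alternative to the nchar() approach above, a regular expression can
# also strip the leading initials (a base R sketch using sub())

as.numeric(sub("^[A-Za-z]+", "", id))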
############################################################################

# multilevel / mixed-effects modeling
# https://en.wikipedia.org/wiki/Multilevel_model
# https://en.wikipedia.org/wiki/Mixed_model

# nlme package:
# - older, no further development
# - only for general linear mixed-effects models
# - handles serial correlation & heteroscedastic errors
#
# lme4 package:
# - newer, still under active development
# - handles large data sets more efficiently
# - general and generalized linear mixed-effects models
# - handles crossed random effects more easily
#
# both also handle non-linear models
# (there are other packages, but these are the most popular)

library(nlme)

head(Orthodont)
help(Orthodont)

# random intercepts and slopes model

res <- lme(distance ~ age * Sex, random = ~ 1 + age | Subject, data=Orthodont)
summary(res)
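# for comparison, a sketch of the analogous model fitted with the 'lme4'
# package (assuming it can be installed); note that lme4 does not report
# p-values for the fixed effects by default

loadpkg(lme4)

res <- lmer(distance ~ age * Sex + (1 + age | Subject), data=Orthodont)
summary(res)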
############################################################################

# survival analysis
# https://en.wikipedia.org/wiki/Survival_analysis

rm(list=ls())

library(survival)

head(leukemia)
help(leukemia)

# Kaplan-Meier estimator / plot
# https://en.wikipedia.org/wiki/Kaplan–Meier_estimator

res <- survfit(Surv(time, status) ~ x, data = leukemia)
summary(res)

# Kaplan-Meier plot

plot(res, lty = c("dotted", "dashed"), lwd=3)
legend("topright", inset=.02, c("Maintained","Nonmaintained"),
       lty = c("dotted","dashed"), lwd=3)

# Kaplan-Meier plot with 95% CI

plot(res, col=c("blue","red"), lwd=3)
lines(res, conf.int=TRUE, col=c("blue","red"), lty="dotted") # add confidence intervals
legend("topright", inset=.02, c("Maintained","Nonmaintained"),
       col=c("blue","red"), lty="solid", lwd=3, bg="white")

# log-rank test
# https://en.wikipedia.org/wiki/Logrank_test

survdiff(Surv(time, status) ~ x, data=leukemia)

# Cox proportional hazards regression model
# https://en.wikipedia.org/wiki/Proportional_hazards_model

res <- coxph(Surv(time, status) ~ x, data=leukemia)
summary(res)

# predicted survivor function
# https://en.wikipedia.org/wiki/Survival_function

newdat <- data.frame(x=c("Maintained","Nonmaintained"))
pred <- summary(survfit(res, newdata=newdat))
pred

plot(pred$surv[,1] ~ pred$time, ylim=c(0,1), type="s", col="blue")
lines(pred$surv[,2] ~ pred$time, type="s", col="red")
legend("topright", legend=c("Maintained","Nonmaintained"),
       lty="solid", col=c("blue","red"), inset=.02)

# testing the proportional hazards assumption

cox.zph(res)

############################################################################

# how to replace missing values (e.g., with the mean)

rm(list=ls())

load("data_survey_edit.rdata")

dat$pss

# mean imputation (not a good method ...)

dat$pss[is.na(dat$pss)] <- mean(dat$pss, na.rm=TRUE)
dat$pss

############################################################################

# multiple imputation
# https://en.wikipedia.org/wiki/Imputation_(statistics)#Multiple_imputation

rm(list=ls())

loadpkg(mice)

load("data_survey_edit.rdata")

sub <- dat[c("pss", "age", "smoke", "rses")]

# lm() does listwise deletion when there are missing values

res <- lm(pss ~ age + smoke + rses, data=sub)
summary(res)

# set up the predictor matrix (a 1 means that the variable in the column is
# used to predict the variable in the row, a 0 means that it is not used)

predMatrix <- make.predictorMatrix(sub)
predMatrix

# set up the imputation methods vector (pmm = 'predictive mean matching')

impMethod <- make.method(sub)
impMethod

# run mice

imp <- mice(sub, method = impMethod, predictorMatrix = predMatrix, seed = 12345)

# fit the model in each imputed dataset

fit <- with(imp, lm(pss ~ age + smoke + rses))

# pool the results

pool <- pool(fit)
summary(pool)

# a nice book about multiple imputation and the mice package:
#
# van Buuren, S. (2018). Flexible imputation of missing data (2nd ed.).
# Chapman & Hall.
#
# freely available here: https://stefvanbuuren.name/fimd/

############################################################################

# how to change the position of a variable in a dataset

rm(list=ls())

load("data_survey_edit.rdata")

head(dat)

# move the 'lotr' variable after the 'lotr6' item

loadpkg(dplyr)

dat <- relocate(dat, "lotr", .after = "lotr6")
head(dat)

# note: this could be done without loading any additional packages, but the
# code to do this in an elegant/flexible way gets a bit cumbersome
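# for illustration, a base R sketch of the same operation: drop 'lotr' from
# the current ordering of the variable names and splice it back in after 'lotr6'

nm  <- setdiff(names(dat), "lotr")
pos <- which(nm == "lotr6")
dat <- dat[c(nm[1:pos], "lotr", nm[-(1:pos)])]
head(dat)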
############################################################################

# how to create an 'axis break'

# just for illustration purposes, make two subjects in the dataset really old

tmp <- dat
tmp$age[1] <- 229
tmp$age[2] <- 237

plot(tmp$age, tmp$pss, pch=19, bty="n", xlab="Age", ylab="Stress",
     xlim=c(0,250), ylim=c(10,50))

loadpkg(plotrix)

gap.plot(tmp$age, tmp$pss, pch=19, xlab="Age", ylab="Stress",
         xlim=c(0,250), ylim=c(10,50), bty="n", gap=c(104,218),
         gap.axis="x", xtics=c(0,25,50,75,100,225,250))

############################################################################

# how to obtain standardized regression coefficients

res <- lm(posaff ~ lotr + rses, data=dat)
summary(res)

# create a data frame which includes the variables we need for our model and
# has all rows with at least one missing value removed ('listwise deletion')

tmp <- na.omit(dat[c("posaff", "lotr", "rses")])

# check that we get the same results as before

res <- lm(posaff ~ lotr + rses, data=tmp)
summary(res)

# scale (z-score) the outcome and the predictor variables

res <- lm(scale(posaff) ~ scale(lotr) + scale(rses), data=tmp)
summary(res)

# remove the intercept term from the model (which must be 0 here)

res <- lm(scale(posaff) ~ 0 + scale(lotr) + scale(rses), data=tmp)
summary(res)

# note: we create the 'tmp' dataset above, since scale() standardizes each
# variable separately and if there are different sets of missings in the
# variables, then the variables would be standardized based on different
# subsets of the data; by first creating 'tmp', we know that all variables are
# standardized based on the same set of subjects (with complete data on the
# variables of interest)

############################################################################

# creating a 'Table 1' for a paper

rm(list=ls())

load("data_survey_edit.rdata")

# install (if necessary) the 'tableone' and 'labelled' packages and load them

loadpkg(tableone)
loadpkg(labelled)

# attach some variable labels to some variables in the dataset

var_label(dat) <- list(age="Age", marital="Marital Status",
                       educ="Educational Level", pss="PSS")

# create Table 1 and print it

tab <- CreateTableOne(vars=c("age", "marital", "educ", "pss"), strata="sex", data=dat)
print(tab, varLabels=TRUE)

# instead of testing, show standardized mean differences

print(tab, varLabels=TRUE, test=FALSE, smd=TRUE)

# turn marital into a factor and set the reference level

dat$marital <- relevel(factor(dat$marital), ref="single")

# recreate the table

tab <- CreateTableOne(vars=c("age", "marital", "educ", "pss"), strata="sex", data=dat)
print(tab, varLabels=TRUE, test=FALSE, smd=TRUE)

# turn educ into a factor and set the levels in the desired order

dat$educ <- factor(dat$educ)
levels(dat$educ)
levels(dat$educ)[c(3,5,2,4,6,1)]
dat$educ <- factor(dat$educ, levels=levels(dat$educ)[c(3,5,2,4,6,1)])
levels(dat$educ)

# recreate the table

tab <- CreateTableOne(vars=c("age", "marital", "educ", "pss"), strata="sex", data=dat)
print(tab, varLabels=TRUE, test=FALSE, smd=TRUE)

# save the table as a csv file

out <- print(tab, varLabels=TRUE, test=FALSE, smd=TRUE, quote=FALSE)
write.csv(out, "table1.csv")

# can now import this into Excel, edit if needed, and copy-paste into Word

# the 'table1' package also provides this kind of functionality

loadpkg(table1)

label(dat$age)     <- "Age"
label(dat$marital) <- "Marital Status"
label(dat$educ)    <- "Educational Level"
label(dat$pss)     <- "PSS"

table1(~ age + marital + educ + pss | sex, data=dat)

# this creates an HTML table, which one can copy-paste into Word

############################################################################

# some other techniques/packages/functions:
#
# - generalized estimating equations: gee, geepack, glmtoolbox
# - time series: ts(), arima()
# - bootstrapping: boot
# - CFA/SEM: lavaan, sem
# - machine learning: nnet, rpart, lasso2, lars, glmnet, elasticnet, randomForest, ...
#
# non-parametric methods (a few quick examples follow below):
# - cor() with arguments method="kendall" or method="spearman"
# - wilcox.test() – Wilcoxon rank sum and signed rank tests
# - kruskal.test() – Kruskal-Wallis rank sum test
# - friedman.test() – Friedman rank sum test
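# for example, using the survey data (a sketch; this assumes 'dat' is still
# loaded from the Table 1 section above)

cor(dat$pss, dat$posaff, method="spearman", use="complete.obs")
wilcox.test(pss ~ sex, data=dat)
kruskal.test(pss ~ marital, data=dat)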
############################################################################

# confirmatory factor analysis (CFA) / structural equation modeling (SEM)
# https://en.wikipedia.org/wiki/Confirmatory_factor_analysis
# https://en.wikipedia.org/wiki/Structural_equation_modeling

# install (if necessary) the 'lavaan' package and load it
loadpkg(lavaan)

# load the data

load("data_survey_edit.rdata")

# fit a one-factor CFA model to the items of the PSS scale

model1 <- 'PSS =~ pss1 + pss2 + pss3 + pss4 + pss5 + pss6 + pss7 + pss8 + pss9 + pss10'

res1 <- cfa(model1, data=dat, estimator="ML", std.lv=TRUE)
summary(res1, fit.measures=TRUE, standardized=TRUE)

# fit a two-factor CFA model to the items of the PSS scale

model2 <- '
PSSpos =~ pss1 + pss2 + pss3 + pss6 + pss9 + pss10
PSSneg =~ pss4 + pss5 + pss7 + pss8'

res2 <- cfa(model2, data=dat, estimator="ML", std.lv=TRUE)
summary(res2, fit.measures=TRUE, standardized=TRUE)

# LRT comparing the fit of the two models

anova(res1, res2)

# an example of a structural equation model

model <- '
# measurement model
RSES =~ rses1 + rses2 + rses3 + rses4 + rses5 + rses6 + rses7 + rses8 + rses9 + rses10
PSSpos =~ pss1 + pss2 + pss3 + pss6 + pss9 + pss10
PSSneg =~ pss4 + pss5 + pss7 + pss8
LOTR =~ lotr1 + lotr2 + lotr3 + lotr4 + lotr5 + lotr6
# regressions
PSSpos ~ RSES
PSSneg ~ RSES
LOTR ~ PSSpos + PSSneg'

res <- sem(model, data=dat, estimator="ML", std.lv=TRUE)
summary(res, fit.measures=TRUE, standardized=TRUE)

# plot the model

loadpkg(lavaanPlot)

lavaanPlot(model=res, coef=TRUE, stars="regress")

# packages semPlot (https://cran.r-project.org/package=semPlot) and tidySEM
# (https://cran.r-project.org/package=tidySEM) also provide functions for
# plotting SEM models
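# for example, a sketch using semPlot (assuming the package can be installed);
# semPaths() draws a path diagram of the fitted lavaan model

loadpkg(semPlot)

semPaths(res, whatLabels="std", layout="tree")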
geom_hex()") p5 <- ggplot(sub, aes(x = pss, y = posaff)) + geom_density_2d() + labs(title = "5: geom_density_2d()") p6 <- ggplot(sub, aes(x = pss, y = stress, fill = stress)) + geom_boxplot() + guides(fill = "none") + labs(title = "6: geom_boxplot()") p7 <- ggplot(sub, aes(x = pss, y = stress, fill = stress)) + geom_violin() + guides(fill = "none") + labs(title = "7: geom_violin()") p8 <- ggplot(sub, aes(x = pss, fill = stress)) + geom_density(alpha = 0.5) + guides(fill = "none") + labs(title = "8: geom_density()") p9 <- ggplot(sub, aes(x = pss, y = stress, fill = stress)) + geom_density_ridges() + guides(fill = "none") + labs(title = "9: ggridges::geom_density_ridges()") p1 + p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 + plot_layout(nrow = 3) # see: https://ggplot2-book.org # https://ggplot2.tidyverse.org ############################################################################ # Some other notes about the tidyverse: https://www.tidyverse.org # The tidyverse is a collection of R packages that provide an integrated # workflow for various things you want to do in data science. They also # provide some new data structures (i.e., 'tibbles') and a different syntax # that heavily relies on using 'pipes'. To illustrate, let's first install (if # necessary) and load the 'dplyr' package. loadpkg(dplyr) # Say you want to compute the mean of the 'pss' variable in the survey # dataset. The 'base R' way of doing this would be as follows. load("data_survey_edit.rdata") mean(dat$pss, na.rm=TRUE) # The following is an example of using pipes and using a function called # pull() that accomplishes the same thing. dat %>% pull(pss) %>% mean(na.rm=TRUE) # The idea is that we can read this left to right: dat is passed to pull(), # which (as the name says) pulls the pss variable from dat, which we then pass # to the mean() function. # Or say we want the mean pss values for the male and female subjects. Using # 'base R', we could accomplish this as follows. mean(dat$pss[dat$sex == "male"], na.rm=TRUE) mean(dat$pss[dat$sex == "female"], na.rm=TRUE) # Instead, we could use pipes and the filter() and pull() functions. dat %>% filter(sex == "male") %>% pull(pss) %>% mean(na.rm=TRUE) dat %>% filter(sex == "female") %>% pull(pss) %>% mean(na.rm=TRUE) # Often, sequences using pipes are split up across multiple lines. dat %>% filter(sex == "male") %>% pull(pss) %>% mean(na.rm=TRUE) # To learn more, see https://www.tidyverse.org. These RStudio 'cheatsheets' # are also quite nice and focused on tidyverse packages (but not exclusively): # https://posit.co/resources/cheatsheets/ ############################################################################ # note that a 'pipe operator' was also added to 'base R' (the 'native pipe') dat |> pull(pss) |> mean(na.rm=TRUE) # pull() is a function from dplyr; if we want to do thing entirely in 'base R' # without a tidyverse package, we could do the following dat |> _$pss |> mean(na.rm=TRUE) # this looks a bit weird; a bit nicer in my opinion is this dat |> with(mean(pss, na.rm=TRUE)) ############################################################################ # where to go from here? maybe: https://www.bigbookofr.com ############################################################################