############################################################################ # restart the R session (Menu 'Session' - Restart R) ############################################################################ # psychometrics # https://en.wikipedia.org/wiki/Psychometrics # read in data load("data_survey_edit.rdata") head(dat) # install (if necessary) the 'psych' package and load it if (!suppressWarnings(require(psych))) install.packages("psych") library(psych) # install (if necessary) the 'GPArotation' package if (!suppressWarnings(require(GPArotation))) install.packages("GPArotation") # Cronbach's alpha # https://en.wikipedia.org/wiki/Cronbach's_alpha names(dat) grep("lotr", names(dat)) grep("lotr[0-9]", names(dat)) alpha(dat[grep("lotr[0-9]", names(dat))]) alpha(dat[grep("mastery[0-9]", names(dat))]) alpha(dat[grep("pss[0-9]", names(dat))]) alpha(dat[grep("rses[0-9]", names(dat))]) alpha(dat[c("panas1", "panas4", "panas6", "panas7", "panas9", "panas12", "panas13", "panas15", "panas17", "panas18")]) alpha(dat[c("panas2", "panas3", "panas5", "panas8", "panas10", "panas11", "panas14", "panas16", "panas19", "panas20")]) # scree plot # https://en.wikipedia.org/wiki/Scree_plot sub <- dat[grep("pss[0-9]", names(dat))] scree(sub, factors=FALSE) # parallel analysis # https://en.wikipedia.org/wiki/Parallel_analysis fa.parallel(sub, fa="pc", n.iter=1000, sim=FALSE) # principal component analysis (PCA) # https://en.wikipedia.org/wiki/Principal_component_analysis principal(sub, nfactors=2, rotate="oblimin") # exploratory factor analysis (EFA) using principal axis factoring (PAF) # https://en.wikipedia.org/wiki/Exploratory_factor_analysis fa(sub, nfactors=2, rotate="oblimin", fm="pa") ############################################################################ # heatmaps # https://en.wikipedia.org/wiki/Heat_map rm(list=ls()) # install (if necessary) the 'viridis' package and load it if (!suppressWarnings(require(viridis))) install.packages("viridis") library(viridis) # read in data load("data_survey_edit.rdata") # create a dataset for which we want to create a heatmap mat <- cor(dat[c("age", "lotr", "mastery", "pss", "rses", "posaff", "negaff")], use = "complete.obs") mat dat <- as.data.frame(mat) rownames(dat) <- 1:nrow(dat) dat <- cbind(var=colnames(dat), dat) dat # remove first column dat[2:8] dat[-1] # a basic heatmap heatmap(dat[-1]) # heatmap doesn't take data frames as input; need to supply a matrix heatmap(as.matrix(dat[-1])) # add row names based on 'var' variable heatmap(as.matrix(dat[-1]), labRow=dat$var) # switch to viridis color scheme heatmap(as.matrix(dat[-1]), col=viridis(50), labRow=dat$var) # reverse the order of the columns heatmap(as.matrix(dat[-1]), col=viridis(50), labRow=dat$var, revC=TRUE) # no dendograms heatmap(as.matrix(dat[-1]), col=viridis(50), labRow=dat$var, revC=TRUE, Rowv=NA, Colv=NA) # more space for margins (default is c(5,5)) heatmap(as.matrix(dat[-1]), col=viridis(50), labRow=dat$var, revC=TRUE, Rowv=NA, Colv=NA, margins=c(7,7)) # don't rescale values heatmap(as.matrix(dat[-1]), col=viridis(50), labRow=dat$var, revC=TRUE, Rowv=NA, Colv=NA, margins=c(7,7), scale="none") # add a legend (manually) heatmap(as.matrix(dat[-1]), col=viridis(50), labRow=dat$var, revC=TRUE, Rowv=NA, Colv=NA, margins=c(7,16), scale="none") legend("right", fill=viridis(21), legend=round(seq(min(dat[-1]),max(dat[-1]),length=21), 2)) # install (if necessary) the 'pheatmap' package and load it if (!suppressWarnings(require(pheatmap))) install.packages("pheatmap") library(pheatmap) pheatmap(as.matrix(dat[-1]), col=viridis(50), labels_row=dat$var) pheatmap(as.matrix(dat[-1]), col=viridis(50), labels_row=dat$var, fontsize=16) pheatmap(as.matrix(dat[-1]), col=viridis(50), fontsize=16, labels_row=paste(" ", dat$var, " "), labels_col=paste(" ", dat$var, " ")) pheatmap(as.matrix(dat[-1]), col=viridis(50), fontsize=16, labels_row=paste(" ", dat$var, " "), labels_col=paste(" ", dat$var, " "), cluster_rows=FALSE, cluster_col=FALSE) # install (if necessary) the 'heatmaply' package and load it if (!suppressWarnings(require(heatmaply))) install.packages("heatmaply") library(heatmaply) heatmaply(dat[-1]) ggheatmap(dat[-1]) ############################################################################ # plot of intensity at x and y coordinates set.seed(1234) n <- 200 dat <- data.frame(x = runif(n), y = runif(n)) dat$intensity <- 0.55 - 1 * (dat$x-0.7)^2 + 1 * (dat$y-0.2)^3 + rnorm(n, 0, .04) head(dat) range(dat$intensity) # install (if necessary) the 'akima' package and load it if (!suppressWarnings(require(akima))) install.packages("akima") library(akima) # interpolate points res <- interp(dat$x, dat$y, dat$intensity) filled.contour(res, color=hcl.colors, xlab="x", ylab="y") res$z[is.na(res$z)] <- 0 filled.contour(res, color=hcl.colors, xlab="x", ylab="y") filled.contour(res, color=hcl.colors, xlab="x", ylab="y", nlevels=100) # something really fancy if (!suppressWarnings(require(plotly))) install.packages("plotly") library(plotly) plot_ly(x = ~ res$x, y = ~ res$y, z = ~ res$z) %>% add_surface( contours = list( z = list( show = TRUE, usecolormap = TRUE, highlightcolor = "#ff0000", project = list(z=TRUE) ) ) ) %>% layout( scene = list( camera = list( eye = list(x=1.87, y=0.88, z=-0.64) ) ) ) ############################################################################ # just in case, clear the workspace rm(list=ls()) # merge two datasets by a common id dat1 <- data.frame(id = c(1, 3, 4, 5, 7), age = c(30, 34, 28, 21, 29), sex = c("f", "m", "m", "f", "m")) dat2 <- data.frame(id = c(1, 2, 4, 5, 6), pss = c(29, 22, 19, 31, 27)) dat1 dat2 dat <- merge(dat1, dat2, by="id") dat dat <- merge(dat1, dat2, by="id", all.x=TRUE) dat dat <- merge(dat1, dat2, by="id", all=TRUE) dat ############################################################################ # restructure a dataset from wide to long format dat.wide <- data.frame(subj = 1:5, y1 = c(5,3,6,7,3), y2 = c(4,NA,6,5,4), y3 = c(2,3,4,4,1)) dat.wide dat.long <- reshape(dat.wide, direction="long", varying=list(2:4), v.names="y", timevar="week", idvar="id") dat.long dat.long <- dat.long[order(dat.long$id),] dat.long dat.long$id <- NULL rownames(dat.long) <- 1:nrow(dat.long) dat.long # restructure a dataset from long to wide format reshape(dat.long, direction="wide", idvar="subj", timevar="week", v.names = "y", sep="") ############################################################################ # per-group operations in long format datasets dat.long # get the mean of y for each subject aggregate(dat.long$y, by=list(dat.long$subj), FUN=mean) aggregate(dat.long$y, by=list(dat.long$subj), FUN=mean, na.rm=TRUE) by(dat.long$y, dat.long$subj, mean) c(by(dat.long$y, dat.long$subj, mean, na.rm=TRUE)) # add the mean of y for each subject to the dataset dat.long$ym <- ave(dat.long$y, dat.long$subj, FUN=mean) dat.long dat.long$ym <- ave(dat.long$y, dat.long$subj, FUN = function(x) mean(x, na.rm=TRUE)) dat.long ############################################################################ # dichotomizing / categorizing / collapsing rm(list=ls()) # read in data load("data_survey_edit.rdata") # dichotomize a quantitative variable ('median split') dat$highpa <- ifelse(dat$posaff > median(dat$posaff), 1, 0) head(dat) table(dat$highpa) table(dat$posaff, dat$highpa) # categorize a quantitative variable dat$pss table(dat$pss) dat$pss.lvl <- ifelse(dat$pss > 25, 1, 0) table(dat$pss.lvl) dat$pss.lvl <- cut(dat$pss, breaks=c(0, 20, 30, 40, 50)) table(dat$pss.lvl) # (0,20] means 'just above 0 to 20 inclusive' dat$pss.lvl <- cut(dat$pss, breaks=c(0, 20, 30, 40, 50), labels=1:4) table(dat$pss.lvl) dat$pss.lvl <- cut(dat$pss, breaks=c(0, 20, 30, 40, 50), labels=c("low", "medium", "high", "very high")) table(dat$pss.lvl) # collapse some levels of a categorical variable dat$stress <- dat$source table(dat$stress, useNA="always") dat$stress <- ifelse(dat$stress == "children", "interpers", dat$stress) dat$stress <- ifelse(dat$stress == "family", "interpers", dat$stress) dat$stress <- ifelse(dat$stress == "friendships", "interpers", dat$stress) dat$stress <- ifelse(dat$stress == "spouse/partner", "interpers", dat$stress) table(dat$stress, useNA="always") # a shorter way dat$stress <- NULL dat$stress <- ifelse(dat$source %in% c("children", "family", "friendships", "spouse/partner"), "interpers", dat$source) table(dat$stress, useNA="always") # collapse it further dat$stress <- ifelse(dat$stress %in% c("health/illness", "lack of time", "life in general", "money/finances"), "other", dat$stress) table(dat$stress, useNA="always") ############################################################################ rm(list=ls()) # change more than 2 categories into color names dat <- data.frame(x = c("grp1", "grp2", "grp2", "grp3", NA, "grp1"), stringsAsFactors = FALSE) dat # using nested if-else statements ifelse(dat$x == "grp1", "red", ifelse(dat$x == "grp2", "blue", "green")) # using as.numeric(factor()) c("red", "blue", "green")[as.numeric(factor(dat$x, levels=c("grp1", "grp2", "grp3")))] # using with() together with {} and base code with(dat, { cols <- NA cols[x == "grp1"] <- "red" cols[x == "grp2"] <- "blue" cols[x == "grp3"] <- "green" cols }) # this also works nicely when multiple groups should receive the same color name with(dat, { cols <- NA cols[x %in% c("grp1", "grp2")] <- "red" cols[x %in% "grp3"] <- "green" cols }) # using switch() with sapply() sapply(dat$x, switch, grp1 = "red", grp2 = "blue", grp3 = "green", NA ) # using car::recode() if (!suppressWarnings(require(car))) install.packages("car") library(car) recode(dat$x , " 'grp1' = 'red'; 'grp2' = 'blue'; 'grp3' = 'green' ") # using dplyr::case_when() if (!suppressWarnings(require(dplyr))) install.packages("dplyr") library(dplyr) with(dat, case_when( x == "grp1" ~ "red", x == "grp2" ~ "blue", x == "grp3" ~ "green" )) ############################################################################ # string manipulation id <- c("DB1965", "PL1967", "ES1975") id substr(id, start=3, stop=6) as.numeric(substr(id, start=3, stop=6)) id <- c("DB1965", "PLK1967", "ES1975") id as.numeric(substr(id, start=3, stop=6)) nchar(id) nchar(id)-3 substr(id, start=nchar(id)-3, stop=nchar(id)) as.numeric(substr(id, start=nchar(id)-3, stop=nchar(id))) ############################################################################ # multilevel / mixed-effects modeling # https://en.wikipedia.org/wiki/Multilevel_model # https://en.wikipedia.org/wiki/Mixed_model # nlme package: # - older, no further development # - only for general linear mixed-effects models # - handles serial correlation & heteroscedastic errors # # lme4 package: # - newer, still under active development # - handles large data sets more efficiently # - general and generalized linear models # - handles crossed random effects more easily # # both also handle non-linear models # (there are other packages, but these are the most popular) library(nlme) data(Orthodont) Orthodont res <- lme(distance ~ age * Sex, random = ~ age | Subject, data=Orthodont) summary(res) ############################################################################ # survival analysis # https://en.wikipedia.org/wiki/Survival_analysis rm(list=ls()) library(survival) data(leukemia) head(leukemia) help(leukemia) # Kaplan-Meier estimator / plot # https://en.wikipedia.org/wiki/Kaplan–Meier_estimator res <- survfit(Surv(time, status) ~ x, data = leukemia) summary(res) # Kaplan-Meier plot with 95% CI plot(res, lty = c("dotted", "dashed"), lwd=3) legend("topright", inset=.02, c("Maintained","Nonmaintained"), lty = c("dotted","dashed"), lwd=3) # Kaplan-Meier plot with 95% CI plot(res, col=c("blue","red"), lwd=3) lines(res, conf.int=TRUE, col=c("blue","red"), lty="dotted") # add confidence intervals legend("topright", inset=.02, c("Maintained","Nonmaintained"), lty = c("dotted","dashed"), lwd=3, bg="white") # log-rank test # https://en.wikipedia.org/wiki/Logrank_test survdiff(Surv(time, status) ~ x, data=leukemia) # Cox proportional hazards regression model # https://en.wikipedia.org/wiki/Proportional_hazards_model res <- coxph(Surv(time, status) ~ x, data=leukemia) summary(res) # predicted survivor function # https://en.wikipedia.org/wiki/Survival_function newdat <- data.frame(x=c("Maintained","Nonmaintained")) pred <- summary(survfit(res, newdata=newdat)) pred plot(pred$surv[,1] ~ pred$time, ylim=c(0,1), type="s", col="blue") lines(pred$surv[,2] ~ pred$time, type="s", col="red") legend("topright", legend=c("Maintained","Nonmaintained"), lty="solid", col=c("blue","red"), inset=.02) # testing the proportional hazards assumption cox.zph(res) ############################################################################ # multiple imputation # https://en.wikipedia.org/wiki/Imputation_(statistics)#Multiple_imputation rm(list=ls()) if (!suppressWarnings(require(mice))) install.packages("mice") library(mice) load("data_survey_edit.rdata") sub <- dat[c("pss", "age", "smoke", "rses")] # lm() does listwise deletion when there are missing values res <- lm(pss ~ age + smoke + rses, data=sub) summary(res) # set up predictor matrix predMatrix <- make.predictorMatrix(sub) predMatrix # set up imputation methods vector impMethod <- make.method(sub) impMethod # run mice imp <- mice(sub, method = impMethod, predictorMatrix = predMatrix, seed = 12345) # fit model in each imputed dataset fit <- with(imp, lm(pss ~ age + smoke + rses)) # pool results pool <- pool(fit) summary(pool) # https://stefvanbuuren.name/fimd/ ############################################################################ # some other techniques/packages/functions: # - generalized estimating equations: gee, geepack # - time series: ts(), arima() # - bootstrapping: boot, bootstrap # - CFA/SEM: sem, lavaan # - machine learning: nnet, rpart, lasso2, lars, glmnet, elasticnet, randomForest, ... # # non-parametric methods: # - cor() with arguments method="kendall" or method="spearman" # - wilcox.test() – Wilcoxon rank sum and signed rank tests # - kruskal.test() – Kruskal-Wallis rank sum test # - friedman.test() – Friedman rank sum test ############################################################################