############################################################################ # Course: Introduction to R # Author: Wolfgang Viechtbauer (https://www.wvbauer.com) # License: CC BY-NC-SA 4.0 # # last updated: 2024-01-23 ############################################################################ # restart the R session (Menu 'Session' - 'Restart R') ############################################################################ # most datasets are just a bunch of vectors (of the same length) combined into # what is called a 'data frame'; let's create such an object manually id <- c("Bob", "Sue", "John") age <- c(25, 21, 30) sex <- c("Male", "Female", "Male") grp <- c("Trt", "Trt", "Ctrl") dat <- data.frame(id, age, sex, grp) dat # notes: # - on the very left, we have the row names (not a variable!); by default, # they are just consecutive numbers, but don't have to be # - character variables are not shown with quotes # note: in the Environment pane, we now have a 'Data' object called 'dat' # # can click on 'dat' to view the contents (more useful for larger datasets) # # this is not for editing; we do not make any manual changes to our data! # this is the same as using the View() command View(dat) # note: R is case-sensitive view(dat) # an important point: objects are not linked age dat age <- c(35, 31, 45) age dat # if you change the 'age' object, then the 'age' variable in 'dat' is unchanged # R is not Excel ... # clean up the workspace a bit (keep things tidy!) ls() rm(id, age, grp, sex) ls() # accessing individual variables within a data frame dat$id dat$age dat$sex dat$grp # subsetting (value before the comma is the row, value after is the column) dat dat[1,2] # subsetting (columns) dat[,2] dat[,c(1,4)] # note: if you take a single column, you get a vector # for data frames, we can also use a special notation to select columns dat[2] dat[c(1,4)] # careful: with this notation, taking a single column returns a data frame # with that column (and not a vector with the values of that column) # can also refer to columns by their variable names dat[,"age"] dat[,c("id","grp")] dat["age"] dat[c("id","grp")] # subsetting (rows) dat[1,] dat[c(1,2),] # combine selection of rows and columns dat[1:2, c("id","grp")] # subsetting with logicals dat$sex dat$sex == "Male" dat[dat$sex == "Male",] # you can read this as: from 'dat', give me all rows where variable 'sex' from # 'dat' is equal to 'Male' # hence the following does not work dat[sex == "Male",] # have to be explicit where the variable for the comparison can be found dat[dat$sex == "Male",] # note: such subsetting will just return the dataset for the males, but this # is not a permanent selection; examine object 'dat' again dat # if you want to make a permanent selection, you have to assign this output to # an object (either overwrite the original object or make a new one) dat.m <- dat[dat$sex == "Male",] dat.f <- dat[dat$sex == "Female",] dat.m dat.f # note: in R, we can have an unlimited number of objects (including data # frames) available at the same time (see the Environment pane); this can get # confusing quickly, so try to keep your workspace tidy (i.e., remove objects # you no longer need) rm(dat.m, dat.f) # using the subset() command subset(dat, sex == "Male") subset(dat, age >= 25) subset(dat, grp == "Trt") # note: the subset() command is clever enough to look for the variables used # for the subsetting inside of the dataset itself! # subset() can also be used to select one or multiple columns subset(dat, select = c(age, sex)) # can also use this to subset rows and select columns at the same time subset(dat, sex == "Male", select = c(age, sex)) subset(dat, sex == "Male", select = age) subset(dat, sex == "Male", select = age, drop = TRUE) # drop = TRUE to turn the one column data frame into a vector # add a new variable to a data frame dat dat$y <- c(5, 7, 999) dat # make a copy of an object dat2 <- dat dat2 # sort a vector dat2$age sort(dat2$age) dat2 # as before, this is not a permanent change unless you 'back assign' it dat2$age <- sort(dat2$age) dat2 # note: this is NOT the right way to sort a data frame; it just sorts the age # variable within dat2, but all of the other variables are unchanged, so now # dataset dat2 is totally screwed up rm(dat2) # how to sort a data frame dat order(dat$age) # this means: the 2nd person has the lowest age, the 1st person has the next # higher age, and the 3rd person has the highest age # so we can use this order using the 'subsetting' notation introduced earlier dat[order(dat$age),] # now the entire dataset has been sorted correctly by age # but again, this is not permanent dat # to make this permanent, back-assign it dat <- dat[order(dat$age),] dat # value replacement (suppose 999 actually stands for missing data) dat$y dat$y==999 dat$y[dat$y==999] <- NA dat # note: the variable that is being changed does not have to be the same # variable that is used to select cases dat$y[dat$id == "John"] <- 8 dat # this way you can make corrections to a dataset dat$y[3] <- 8 dat # rename a variable dat names(dat) names(dat)[1] names(dat)[1] <- "subject" dat # but in large datasets, counting variable names to figure out the position of # the variable that you want to rename would be very tedious names(dat) names(dat) == "age" names(dat)[names(dat) == "age"] <- "years" dat # remove a variable from a data frame dat dat$y <- NULL dat # generate a new variable based on an existing one dat dat$days <- dat$years * 365 dat # sum/mean of several variables dat$y1 <- c(2, 4, 3) dat$y2 <- c(5, 5, 1) dat dat$ysum <- dat$y1 + dat$y2 dat$ymean <- (dat$y1 + dat$y2) / 2 dat # there are special functions for this dat$ysum <- NULL dat$ymean <- NULL dat dat[c("y1","y2")] rowSums(dat[c("y1","y2")]) rowMeans(dat[c("y1","y2")]) dat$ysum <- rowSums(dat[c("y1","y2")]) dat$ymean <- rowMeans(dat[c("y1","y2")]) dat # what if there are missing values? dat$y1[2] <- NA dat # then the resulting mean/sum will also be NA dat$ysum <- rowSums(dat[c("y1","y2")]) dat$ymean <- rowMeans(dat[c("y1","y2")]) dat # can avoid this with the 'na.rm' argument (set it to TRUE); then the mean or # sum is taken over the non-missing values within each row dat$ysum <- rowSums(dat[c("y1","y2")], na.rm=TRUE) dat$ymean <- rowMeans(dat[c("y1","y2")], na.rm=TRUE) dat # subsetting when there are missing values dat dat$y1 >= 2 dat[dat$y1 >= 2,] # not good :( # easier dat$y1 >= 2 which(dat$y1 >= 2) dat[which(dat$y1 >= 2),] # easiest subset(dat, y1 >= 2) ############################################################################ # quick summary of the bracket notation: # # say 'x' is a vector (could also be something like dat$x), then we can use [] # to select one or multiple elements from that vector (e.g., x[2] or x[1:3]) # # say 'dat' is a data frame, then we can use: # - dat[] to select one or more columns (e.g., dat[3] or dat["age"]) # - dat[row(s),column(s)] to select one or more rows and one or more columns # (e.g., dat[1:3,3] or dat[1:3,"age"]) # - when leaving out row(s) or column(s), then this means to select all rows # or columns (e.g., dat[1:3,] or dat[,"age"]) # # often we use 'logicals' for selection/subsetting (e.g., dat[dat$age > 21,]) ############################################################################ # a few other object types # matrices: similar to data frames (i.e., have rows and columns), but all # elements in a matrix must be of the same type (numeric, character, etc.) # arrays: similar to matrices but can have more than two dimensions # lists: a collection of objects (components); a list allows you to gather a # variety of (possibly unrelated) objects under one name # example of a list with 4 components w <- list(name="Fred", age=24, grades=c(7,8,6,9,5,6), address=c("14 Pine Ave, Nicetown", "104 South Street, Bad City")) w # note: data frames are really just a special case of lists, where each # component is of the same length # factors: a special data type for nominal variables gender <- c("Male","Male","Male","Male","Female","Male","Male","Female","Male") gender gender <- factor(gender) gender # internally, factors are stored as integers that are mapped to the levels # (here: 1 = Female, 2 = Male) # R now treats gender as a nominal variable summary(gender) ############################################################################