############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2023-05-11 # # Topic(s): # - An Introduction to R # https://cran.r-project.org/doc/manuals/r-release/R-intro.html # - Section(s): 3.1 - 4.3 # # last updated: 2024-02-23 ############################################################################ ### 3: Objects, their modes and attributes ############################################################################ ## 3.1: Intrinsic attributes: mode and length # create a numeric vector and check its 'mode' x <- c(1, 3, 2, 4, 5, 3) mode(x) # sidenote: the mode() function in R does not give the 'statistical mode' of # the data (i.e., the value that appears most often in a set of data values); # see: https://en.wikipedia.org/wiki/Mode_(statistics) # check if 'x' is a vector is.vector(x) # check if 'x' is an 'atomic vector' is.atomic(x) # try to mix numerical values with a string in the same vector x <- c(1, 3, 2, "Bob", 4, 5) x mode(x) is.vector(x) is.atomic(x) # everything has turned into a character string, so we have a character # vector; R uses various rules for doing this sort of 'type coercion' # create a numeric vector with a missing value x <- c(1, 3, 2, NA, 5, 3) mode(x) # the 4th value is missing and it is a numeric missing value x[4] # in R, this can be explicitly stated as NA_real_ # a character vector with a missing value x <- c("Bob", "Sue", NA, "Gil") mode(x) # the 3rd value is missing and it is a character missing value x[3] # in R, this can be explicitly stated as NA_character_ # list all elements in the workspace ls() # remove x rm(x) # list all elements in the workspace ls() # character(0) indicates a character vector with no elements, so this shows # that there are no elements in the workspace # create a list with three elements l <- list(x = c(1,3,2,5), y = c("Bob","Sue"), z = 42) l # the mode of a list is 'list' (surprise!) mode(l) # a list is defined to be a vector in R, but it is not an atomic vector is.vector(l) is.atomic(l) # create the numeric vector 'x' and obtain the five-number summary of its values x <- c(1, 3, 2, 4, 5, 3, 3, 4, 1, 2, 3, 1, 5, 4, 5, 8, 4, 5) fivenum(x) # fivenum() is a function in R; functions in R are also objects, which you can # inspect and manipulate (and we can easily create new ones) fivenum # can also check the mode of such an object mode(fivenum) # but (obviously) they are not (atomic) vectors is.vector(fivenum) is.atomic(fivenum) # use length() to find the number of elements in a vector length(x) # for a list, length() tells us about the number of elements in the list length(l) # the mode and length of an objects are intrinsic attributes # an object can also have other attributes; for example, lists can have an # attribute called 'names' which gives the names of the list elements attributes(l) # numeric vectors can be of different types x <- c(2.4, 1.8, 5.7, 3.9) x # x is a numeric vector mode(x) # but it is a collection of numbers that have a fractional part; in R, this # means that this numeric vector is of type 'double' typeof(x) # we can have another numeric vector that is of type 'integer'; for this, we # put L after each integer x <- c(2L, 1L, 5L, 3L) x mode(x) typeof(x) # if we don't use L, then the numeric vector is of type 'double' x <- c(2, 1, 5, 3) x mode(x) typeof(x) # cannot mix doubles and integers in the same vector x <- c(2, 1, 5L, 3) x mode(x) typeof(x) # if we try, then the integers get 'promoted' (type coercion!) to doubles # create a vector of the numbers 0, 1, ..., 9 z <- 0:9 z # when we use : to create a numeric sequence, it creates an integer vector typeof(z) # turn the numeric (integer) vector z into a character vector digits <- as.character(z) digits # this is doing 'explicit type coercion' (note that we already saw 'implicit # type coercion' earlier, for example when mixing numbers and strings in a # vector or when mixing doubles and integers in a vector) # can even turn a character vector into a numeric vector using as.numeric() # (note: this always creates a 'double' vector) d <- as.numeric(digits) d typeof(d) # there are also as.double() and as.integer() functions as.double(d) as.integer(d) ############################################################################ ## 3.2: Changing the length of an object # we can create an empty numeric vector with numeric() e <- numeric() e # it is of length 0 length(e) # but this empty vector still has a mode (and a type) mode(e) typeof(e) # similarly, can create empty character or logical vectors character() logical() # note: if you do specify the length of the vector, R initializes these # different vector types to different default values numeric(10) character(10) logical(10) # assign the value 17 to the third element of e; since right now, e is of # length 0, R will happily change the length of the vector to 3, but the first # and second element will be missing (NA) e[3] <- 17 e # in contrast to what is stated in the manual, can even do this when the mode # of the value that we are assigning to an element is different than the mode # of the vector itself, but implicit type coercion then happens e[5] <- "Bob" e # create a vector with 10 numbers alpha <- c(2, 4, 3, 1, 8, 5, 4, 5, 3, 7) alpha # select elements 1 through 5 from this vector and assign it on top of alpha alpha <- alpha[1:5] alpha # this effectively shortens alpha # we can also directly assign a length to a vector and if the length is # shorter than the current length, it drops the extra elements length(alpha) <- 3 alpha # can also assign a longer length than the current one; the additional new # elements then will be missing values (NAs) length(alpha) <- 10 alpha ############################################################################ ## 3.3: Getting and setting attributes # create a numeric vector and add some attributes to it x <- c(2, 4, 3, 1, 6, 8) attr(x, "date") <- "2023-05-11" attr(x, "subject") <- c("Bob", "Sue", "Joe", "Gill", "Tom", "Anna") x # while in this case, printing x directly shows all of its (non-intrinsic) # attributes, we can still use attributes() just to list them attributes(x) # we can extract a particular attribute in two different ways attr(x, "subject") attributes(x)$subject # depending on the type of object, there are some special types of attributes # that have a particular purpose; we can illustrate this using a 'named # vector' # create a named (numeric) vector with 4 elements x <- c(Bob = 2, Sue = 4, Joe = 5, Gill = 8) x mode(x) typeof(x) # the names of such a vector are actually an attribute attributes(x) # so the 'names' attribute is a special type of attribute that also changes # how the vector is printed x # another example of this is the 'dim' attribute # create a numeric vector with 6 elements x <- c(3, 2, 5, 7, 1, 3) x # assign c(3,2) as the 'dim' attribute to 'x' attr(x, "dim") <- c(3,2) # this turns the vector into a matrix with 3 rows and 2 columns x # list the attributes of x attributes(x) ############################################################################ ## 3.4: The class of an object # all of the elements in x are numeric mode(x) # but it is a matrix and it has that class (and also the class 'array') class(x) # illustrate how the class of an object can influence how a function behaves x <- c(2, 3, 2, 5, 3, 2, 4, 2, 3) x summary(x) # so if we use summary() on a numeric vector, it prints the five-number # summary (and also the mean) # turn x into a factor (which can take on the values (the so-called 'levels') # 1, 2, ..., 5) and assign this to y y <- factor(x, levels=1:5) y summary(y) # if we use summary() on a factor, it creates a frequency table # how does R know what to do when using summary() in these different cases? # this has to do with the class of the object class(x) class(y) # depending on the class of the object given to summary(), the function # behaves differently # sidenote: the 'levels' and 'class' of a factor are actually attributes attributes(y) # note that some objects have an 'implicit class' that is not an attribute class(x) attributes(x) # NULL means that there are no attributes for x # manually create a data frame with three variables dat <- data.frame(subject = c("Bob", "Sue", "Gill", "Tom"), age = c(25, 23, 28, 21), y = c(5, 6, 3, 4)) dat # the way a data frame is printed (remember: typing 'dat' is just a shortcut # for 'print(dat)') again has to do with the class of the object class(dat) # remove the class from dat unclass(dat) # when we do so, we see that a data frame is really just a list, where each # list element is a variable in the data frame # consider a more complex example where we fit a linear regression model with # 'y' as the outcome variable and 'age' as the predictor res <- lm(y ~ age, data=dat) res # when we print 'res', we just get the estimated intercept and slope; to get # the full regression table (and things like R^2), we can use summary() summary(res) # that summary() provides all of this output for a 'regression model object' # again has to do with the class of the object class(res) # remove the class from res unclass(res) # now we see that res is actually a list with a whole bunch of different # elements (some of which make sense, like 'coefficients' or 'residuals', but # there are others whose meaning/purpose is not so clear, but this is not # relevant for now) ############################################################################ ### 4: Ordered and unordered factors ############################################################################ ## 4.1: A specific example # create a character vector state <- c("tas", "sa", "qld", "nsw", "nsw", "nt", "wa", "wa", "qld", "vic", "nsw", "vic", "qld", "qld", "sa", "tas", "sa", "nt", "wa", "vic", "qld", "nsw", "nsw", "wa", "sa", "act", "nsw", "vic", "vic", "act") state # turn state into a factor and assign this to statef statef <- factor(state) statef # notice that when you print a factor, it also lists the levels of it; also, # since we did not use the 'levels' argument when creating the factor(), the # levels are automatically the unique values that occur in the data # can also request to list the levels with levels() levels(statef) ############################################################################ ## 4.2: The function tapply() and ragged arrays # create a numeric vector with the incomes incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56, 61, 61, 61, 58, 51, 48, 65, 49, 49, 41, 48, 52, 46, 59, 46, 58, 43) # compute the mean income for each level of statef incmeans <- tapply(incomes, statef, mean) incmeans # can use any function in tapply() that takes a vector as input and produces a # single value as output (like the mean or the SD or whatever) tapply(incomes, statef, sd) # the output from tapply() is just a vector (with the means or SDs or # whatever), which we can collect in a data frame dat <- data.frame(mean = tapply(incomes, statef, mean), sd = tapply(incomes, statef, sd), n = tapply(incomes, statef, length)) dat # create a function called stdError() that computes the standard error of a mean stdError <- function(x) sqrt(var(x)/length(x)) # compute the standard error of the mean income for each statef level and add # this as a new variable to 'dat' dat$se <- tapply(incomes, statef, stdError) dat # add the 95% CI limits for the means to the data frame # see: https://en.wikipedia.org/wiki/Confidence_interval#Example dat$ci.lb <- dat$mean - qt(.975, df=dat$n-1) * dat$se dat$ci.ub <- dat$mean + qt(.975, df=dat$n-1) * dat$se dat # tapply() is even more flexible; the function that we apply to the values # within each state level can also yield multiple values; for example, we # earlier saw the fivenum() function tapply(incomes, statef, fivenum) # then tapply() returns a list where each element is the five-number summary # of a particular level of statef # by splitting the incomes variable by statef levels, we can see that this is # a 'ragged array' because the lengths of these vectors are not all the same split(incomes, statef) ############################################################################ ## 4.3: Ordered factors # create two character vectors name <- c("Bob", "Sue", "Tom", "Gill") speed <- c("slow", "fast", "medium", "slow") # turn speed into a factor (unordered) speedf <- factor(speed) speedf # note: the factor levels are determined alphabetically # we can explicitly set what is the first, second, and third level speedf <- factor(speed, levels=c("slow", "medium", "fast")) speedf # but speedf is still an unordered factor; we can make it an ordered factor in # two different ways speedf <- factor(speed, levels=c("slow", "medium", "fast"), ordered = TRUE) speedf speedf <- ordered(speed, levels=c("slow", "medium", "fast")) speedf # note how the levels are printed for an ordered factor # the information that speedf is now an ordered factor is contained in the # class of this object (which now actually has two classes) class(speedf) ############################################################################