############################################################################ # Open Online R Stream (https://www.wvbauer.com/doku.php/live_streams) # # By: Wolfgang Viechtbauer (https://www.wvbauer.com) # Date: 2022-02-24 # # Topic(s): # - An Introduction to Statistical Learning (https://www.statlearning.com) # - Section(s): 2.3 # # last updated: 2022-03-05 ############################################################################ ### basic commands # create a vector of numbers x <- c(1, 3, 2, 5) x # can also use = to do so x = c(1, 6, 2) x # another vector y = c(1, 4, 3) # check the length of both vectors length(x) length(y) # add up the elements in each vector elementwise x + y # list objects in your 'workspace' ls() # remove objects x and y rm(x, y) # list objects in your 'workspace' ls() # remove all objects in your workspace rm(list = ls()) # open the documentation of the matrix function ?matrix # this is a shortcut for using help help("matrix") # create a 2x2 matrix with the numbers 1-4 x <- matrix(data = c(1, 2, 3, 4), nrow = 2, ncol = 2) x # can also leave out the argument names x <- matrix(c(1, 2, 3, 4), 2, 2) # but note: this can be confusing, so only do this for the first argument x <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2) x # fill in the numbers 'by row' matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2, byrow = TRUE) # take the square-root of each element in x sqrt(x) # square each element in x x^2 # generate 50 random values from a standard normal distribution # in other words: x_i ~ N(0,1) for i = 1, ..., 50 x <- rnorm(50) x # add to the elements in x random values from a normal distribution with mean # 50 and standard deviation 0.1 y <- x + rnorm(50, mean = 50, sd = 0.1) y # correlation between x and y cor(x, y) # set the seed of the random number generator to make things reproducible; # note that both sets of 50 values are exactly the same set.seed(1303) rnorm(50) set.seed(1303) rnorm(50) # generate 100 values from a standard normal distribution set.seed(3) y <- rnorm(100) # get the mean, variance, and standard deviation of the values in y mean(y) var(y) sqrt(var(y)) sd(y) ############################################################################ ### graphics # simulate some data x <- rnorm(100) y <- rnorm(100) # create a scatterplot of x versus y plot(x, y) # add axis labels and a title plot(x, y, xlab = "this is the x-axis", ylab = "this is the y-axis", main = "Plot of X vs Y") # save a figure to a pdf pdf("figure.pdf") plot(x, y, col = "green") dev.off() # seq() for creating sequences x <- seq(1, 10) x # can also use this syntax x <- 1:10 x # for other types of sequences, can use the 'length' or 'by' arguments seq(0, 1, length = 11) seq(0, 10, by = 2) # contour plots x <- seq(-pi, pi, length = 50) x y <- x f <- outer(x, y, function(x, y) cos(y) / (1 + x^2)) contour(x, y, f) # use argument 'nlevels' to adjust the number of contour levels in the plot # and 'add=TRUE' means to add the contours to an already open contour plot contour(x, y, f, nlevels = 45, add = TRUE) # another (silly) example fa <- (f - t(f)) / 2 contour(x, y, fa, nlevels = 15) # filled contour plot using the viridis color palette filled.contour(x, y, fa, nlevels = 15, color.palette = hcl.colors) # or we can use the image() function for this image(x, y, fa) # better use the viridis color palette here as well image(x, y, fa, col = hcl.colors(12)) # perspective plot; use arguments 'theta' and 'phi' to change the points from # which we look at the 3-d surface persp(x, y, fa) persp(x, y, fa, theta = 30) persp(x, y, fa, theta = 30, phi = 20) persp(x, y, fa, theta = 30, phi = 70) persp(x, y, fa, theta = 30, phi = 40) persp(x, y, fa, theta = 30, phi = 40, col = "lightblue") persp(x, y, fa, theta = 30, phi = 40, col = "lightblue", shade = 0.4) ############################################################################ ### indexing data # create a 4x4 matrix A <- matrix(1:16, nrow = 4, ncol = 4) A # give me from A the element in row 2 and column 3 A[2,3] # give me the submatrix of A with rows 1 and 3 and columns 2 and 4 A[c(1,3), c(2,4)] # rows 1-3 and columns 2-4 A[1:3, 2:4] # give me rows 1 and 2 and all columns A[1:2,] # give me all rows and columns 1 and 2 A[,1:2] # careful: when you take a single row, it turns into a vector A[1,] # to avoid that this happens, use drop=FALSE A[1,,drop=FALSE] # same issue when taking just a single column A[,1] A[,1,drop=FALSE] # give me the submatrix of A with rows 1 and 3 removed A[-c(1,3),] # remove rows 1 and 3 and columns 1, 3, and 4 A[-c(1,3), -c(1,3,4)] A[-c(1,3), -c(1,3,4), drop=FALSE] # get the dimensions of the matrix A dim(A) ############################################################################ ### loading data # 1) download the dataset from here: https://www.statlearning.com/s/Auto.data # and here: https://www.statlearning.com/s/Auto.csv # can also use R to download the files #download.file("https://www.statlearning.com/s/Auto.data", destfile="Auto.data") #download.file("https://www.statlearning.com/s/Auto.csv", destfile="Auto.csv") # 2) put the datasets into the same directory/folder as this R script # 3) set the working directory to the location of this R script; you can do # this with RStudio (menu 'Session', Set Working Directory, To Source File # Location, which sets the working directory with the setwd() function to # the currently opened R script); if your computer is configured so that R # scripts are automatically opened in RStudio, then RStudio will set the # working directory automatically (this only works when RStudio is not # already running) # read in the data from Auto.data using the read.table() function Auto <- read.table("Auto.data") # show the first 6 rows of the dataset head(Auto) # get a spreadsheet-like view of the dataset View(Auto) # first row is a header row, missings are indicated with ? Auto <- read.table("Auto.data", header = TRUE, na.strings = "?") head(Auto) # read in a csv file (note: header=TRUE is the default for read.csv()) Auto <- read.csv("Auto.csv", na.strings = "?") head(Auto) dim(Auto) Auto[1:4, ] # remove any row that has at least one missing value Auto <- na.omit(Auto) dim(Auto) # get the variable names of the dataset names(Auto) ############################################################################ ### additional graphical and numerical summaries # this will generate an error because R does not know where to find variables # 'cylinders' and 'mpg' plot(cylinders, mpg) # can use the 'dollar notation' to specify where to find these variables plot(Auto$cylinders, Auto$mpg) # we are not going to use attach() as in the book, because this is not good # practice; instead, we could use the 'data' argument to avoid the repeated # use of Auto$ plot(mpg ~ cylinders, data = Auto) # or could use with() with(Auto, plot(cylinders, mpg)) # turn the cylinders variable into a factor Auto$cylinders <- factor(Auto$cylinders) # could use plot() as in the book, but to be explicit, create boxplots with # the boxplot() function boxplot(mpg ~ cylinders, data = Auto) boxplot(mpg ~ cylinders, data = Auto, col = "red") boxplot(mpg ~ cylinders, data = Auto, col = "red", varwidth = TRUE) boxplot(mpg ~ cylinders, data = Auto, col = "red", varwidth = TRUE, horizontal = TRUE) boxplot(mpg ~ cylinders, data = Auto, col = "red", varwidth = TRUE, xlab = "Number of Cylinders", ylab = "Miles per Galon") # histograms hist(Auto$mpg) hist(Auto$mpg, xlab = "Miles per Galon", main = "") hist(Auto$mpg, col = "lightblue") hist(Auto$mpg, col = "lightblue", breaks = 15) # note: breaks = 15 is a 'suggestion' for the number of breakpoints, so we are # not guaranteed to get exactly 15 breakpoints (and hence 14 bins); we can # also explicitly set the exact location of the breakpoints, which guarantees # that we will get 14 bins; note that the lowest and highest bins are empty # (i.e., they have a frequency of 0) but there are now exactly 14 bins with(Auto, hist(mpg, col = "lightblue", breaks = seq(5, 50, length=15))) # scatterplot matrix pairs(~ mpg + displacement + horsepower + weight + acceleration, data = Auto) pairs(~ mpg + displacement + horsepower + weight + acceleration, data = Auto, pch = 19, cex = 0.5) # the identify function allows you to interactively identify data points; # left-click on points to label them, right-click to stop with this; the # position of the select points is returned, so we can then examine these rows # in the data frame plot(mpg ~ horsepower, data = Auto) rows <- with(Auto, identify(horsepower, mpg, labels = name)) rows Auto[rows,] # once we have identified some particular points to label, can also do the # labeling manually using the text() function (pos = 4 means to place the # labels to the right of the points) rows <- c(103, 153, 331) plot(mpg ~ horsepower, data = Auto) with(Auto, text(horsepower[rows], mpg[rows], name[rows], pos = 4)) # a quick overview of all variables in a data frame summary(Auto) # get summary statistics of a single numeric variable summary(Auto$mpg) # for factor variables, summary() gives a frequency table summary(Auto$cylinders) ############################################################################