############################################################################ # restart the R session (Menu 'Session' - Restart R) # make sure the working directory is set to the directory/folder where the # script and the data are stored; if not, first set it (see rcode01) # reading in a rectangular tab-delimited plain-text data file # - header=TRUE : first row of the file gives the variable names # - sep="\t" : tab is the separator # - as.is=TRUE : don't convert strings to factors # - na.strings="" : blank values are interpreted as NA dat <- read.table("data_survey.dat", header=TRUE, sep="\t", as.is=TRUE, na.strings="") # note: as long as you don't get an error message, data were read in # illustrate an error (note that object 'tmp' is not created) tmp <- read.table("data_survey.dat", header=TRUE, sep=" ", as.is=TRUE, na.strings="") # see data_survey.pdf for variable info / coding manual # note: tab-delimited data can be easily exported from other software # note: it is possible to read in Stata, SPSS, SAS, Excel, etc. files # directly, but will not get into this for now; see the R Data Import/Export # manual: https://cran.r-project.org/doc/manuals/R-data.html # in RStudio, you can also use Menu 'File' - Import Dataset # - if you use this, make sure you copy-paste the code to your script # - also you may want to adjust the name of the object this creates # inspecting large datasets by just printing them isn't very useful dat # there is also a (default) maximum to the amount of values that are printed # (in RStudio, this is set to 1000; in R itself, this is set 99999); we can # manually change this with the options() function options(max.print = 99999) # now RStudio shows all data dat # but if there are many variables, then these are wrapped, which is also # confusing; instead, large datasets are more easily inspected with View() View(dat) # in RStudio, can also click on 'dat' in the 'Environment' pane (top right) # other ways to inspect the data str(dat) head(dat) head(dat, 10) tail(dat) ncol(dat) nrow(dat) dim(dat) names(dat) summary(dat) # get more information on a quantitative variables mean(dat$age) sd(dat$age) median(dat$age) quantile(dat$age, probs=c(0.05,0.95)) IQR(dat$age) min(dat$age) max(dat$age) range(dat$age) max(dat$age) - min(dat$age) summary(dat$age) fivenum(dat$age) table(dat$age) # what if there are missings? mean(dat$smokenum) mean(dat$smokenum, na.rm=TRUE) ############################################################################ # a digression on getting help: how do we know that the mean() function has # such an argument? read the help file! help("mean") # these also usually work help(mean) ?mean # in RStudio, can put cursor on command and hit F1 # the structure of help files: # # - Description = what does the function do? # - Usage = structure of the function (arguments) # - Arguments = what exactly do the arguments do? # - Details = additional details on the function # - Value = what output does the function produce? # - References = any references related to the function # - See Also = related functions # - Examples = examples! # # note: Details, Value, See Also, and Examples are optional # # also note that arguments often have 'default' values # note: do not expect to understand everything on the help pages (they can be # confusing / technical at times); the goal is to improve your understanding # of a function over time # help() is a function as well :) help(help) ############################################################################ # note that the first argument for mean() is called 'x'; so if we want to be # very explicit, we should use the following syntax mean(x=dat$smokenum, na.rm=TRUE) # some people prefer to put spaces before and after = mean(x = dat$smokenum, na.rm = TRUE) # the order of the arguments doesn't matter mean(na.rm=TRUE, x=dat$smokenum) # but R can also do 'positional matching' of arguments; all *non-named* # arguments will be matched by their position; so, the following also works mean(dat$smokenum, na.rm=TRUE) mean(na.rm=TRUE, dat$smokenum) # what will the following do? (check the help file for the 'mean' function) mean(na.rm=TRUE, dat$smokenum, 0.05) # recommendation: don't rely on positional matching too much, as this can be # confusing (I usually don't name the very first argument, but otherwise # explicitly name the other arguments) # note: don't have to fully write out argument names (as long as this is # unambiguous); so the following also works fine mean(dat$smokenum, na=TRUE) # recommendation: don't rely on abbreviated argument names too much (and this # isn't really necessary with the tab completion functionality of RStudio) # get mean of multiple variables colMeans(dat[c("age", "smokenum")]) colMeans(dat[c("age", "smokenum")], na.rm=TRUE) # combine some functions with comparisons (i.e., logicals) dat$age > 30 table(dat$age > 30) sum(dat$age > 30) mean(dat$age > 30) mean(dat$age > 30) * 100 round(mean(dat$age > 30) * 100, digits=2) # note: TRUE is treated as 1, FALSE is treated as 0 # number of missing values in a variable sum(is.na(dat$smokenum)) table(is.na(dat$smokenum)) # frequency table (by default) does not show the number of missing values table(dat$smokenum) # can show the number of missing values (if there are any or show always) table(dat$smokenum, useNA="ifany") table(dat$smokenum, useNA="always") # mean number of cigarettes smoked by the smokers mean(dat$smokenum, na.rm=TRUE) # not correct, because 'smokenum' = 0 for non-smokers dat$smoke == "yes" dat$smokenum[dat$smoke == "yes"] mean(dat$smokenum[dat$smoke == "yes"], na.rm=TRUE) # alternatively subset(dat, smoke == "yes", select=smokenum, drop=TRUE) mean(subset(dat, smoke == "yes", select=smokenum, drop=TRUE), na.rm=TRUE) # a more complex example dat$smokenum[dat$smoke == "yes" & dat$age >= 30] mean(dat$smokenum[dat$smoke == "yes" & dat$age >= 30], na.rm=TRUE) # not sure if this is easier mean(subset(dat, smoke == "yes" & age >= 30, select=smokenum, drop=TRUE), na.rm=TRUE) # frequency table of a categorical variables table(dat$smoke, useNA="always") table(dat$marital, useNA="always") # recode items as needed (see 'data_survey.pdf') dat$lotr2 <- 6 - dat$lotr2 dat$lotr4 <- 6 - dat$lotr4 dat$lotr5 <- 6 - dat$lotr5 dat$mastery1 <- 5 - dat$mastery1 dat$mastery3 <- 5 - dat$mastery3 dat$mastery4 <- 5 - dat$mastery4 dat$mastery6 <- 5 - dat$mastery6 dat$mastery7 <- 5 - dat$mastery7 dat$pss4 <- 6 - dat$pss4 dat$pss5 <- 6 - dat$pss5 dat$pss7 <- 6 - dat$pss7 dat$pss8 <- 6 - dat$pss8 dat$rses3 <- 5 - dat$rses3 dat$rses5 <- 5 - dat$rses5 dat$rses8 <- 5 - dat$rses8 dat$rses9 <- 5 - dat$rses9 dat$rses10 <- 5 - dat$rses10 # compute a scale total dat$lotr <- dat$lotr1 + dat$lotr2 + dat$lotr3 + dat$lotr4 + dat$lotr5 + dat$lotr6 head(dat) # but this can be a lot of typing when there are many items dat$lotr <- NULL # a nice trick if the item names have a common substring names(dat) grep("lotr", names(dat)) dat[grep("lotr", names(dat))] dat$lotr <- rowSums(dat[grep("lotr", names(dat))]) head(dat) # do the same for the other scales dat$mastery <- rowSums(dat[,grep("mastery", names(dat))]) dat$pss <- rowSums(dat[,grep("pss", names(dat))]) dat$rses <- rowSums(dat[,grep("rses", names(dat))]) # PANAS has two subscales, one for positive and one for negative affect # the trick above cannot be used (grepping for "panas" returns all items) # so have to do this the tedious way dat$posaff <- dat$panas1 + dat$panas4 + dat$panas6 + dat$panas7 + dat$panas9 + dat$panas12 + dat$panas13 + dat$panas15 + dat$panas17 + dat$panas18 head(dat) # can save a bit of typing by using the with() command dat$posaff <- with(dat, panas1 + panas4 + panas6 + panas7 + panas9 + panas12 + panas13 + panas15 + panas17 + panas18) dat$negaff <- with(dat, panas2 + panas3 + panas5 + panas8 + panas10 + panas11 + panas14 + panas16 + panas19 + panas20) # saving as a tab-delimited plain-text data file # - row.names=FALSE : do not add row names (it's not a variable and can lead # to problems when reading in the data in other software) # - quote=FALSE : do not put "" around strings # - sep="\t" : tab is the separator # - na="" : use a blank value for missing (NA) values write.table(dat, file="data_survey_edit.dat", row.names=FALSE, quote=FALSE, sep="\t", na="") # when saving this way, it should be fairly unproblematic to read in the data # in other software; note: never overwrite the original data file! # one can also save data (and any other object) in R's own file format save(dat, file="data_survey_edit.rdata") # advantages of .rdata files: # - data are compressed # - saving/loading large data files should be quicker # - all properties of the data/objects are exactly preserved # disadvantage: # - cannot read in data in other software ############################################################################ # remove 'dat' from workspace rm(dat) # list objects in workspace ls() # load data using read.table() dat <- read.table("data_survey_edit.dat", header=TRUE, sep="\t", as.is=TRUE, na.strings="") ls() head(dat) # remove 'dat' from workspace rm(dat) # load data load("data_survey_edit.rdata") ls() head(dat) # note: here we don't assign what we load to an object; instead, the object # name(s) are the same as what we saved (note: an .rdata file can contain # many objects, not just a single data frame) ############################################################################