Chapter 5: Introduction to presenting statistical analyses using R
Box 5.3 Assessing non-response bias
Box 5.3 Code
Click here to show code with comments
# Tell R where you will be working. R will look here for your data. # You could also add the path to the load statements below. setwd("~/Documents/Books/Presenting/data/rData/") # Load the dataset rhinitis out of the current folder/directory into R's # interactive working memory/environment. See chapter 2 for more details. load("nonResponse.RData") # The factor() function below below replaces the character string variable # called race in the nonResponse dataset with a variable, also called race. R # will treat the new variable as an ordered set of categories when it does # statistics and graphics. # The factor() function uses a comma-delimited set of details. Programmers call # the details that a function understands "arguments". Notice the ( following # the word factor and the commas ending the lines with the arguments until # the closing ) . nonResponse$race <- factor(nonResponse$race, # The c() function can glue together a set of similar # things, for example, a bunch of numbers, to make a # vector. The levels argument is expecting a vector. # Remember that the c() function stores similar # things as a vector. In this case, the vector is # storing character strings that are the actual # values of the race variable. levels = c("White", "African origin", "Asian"), order = TRUE ) nonResponse$sex <- factor(nonResponse$sex, levels = c("Male", "Female"), order = TRUE ) # The with() function, used below, allows you to work with the variables within # a data frame without having to repeatedly mention the data frame. The code # below could have been written as: # table(nonResponse$race, nonResponse$sex) # # Another option is to attach the data frame at the "top" of the search list with # code like this: # attach(nonResponse) # table(race, sex) # detach(nonResponse) # That strategy can work if you are only working with a single data frame but it # can lead to problems. Problems will ensue if you do not detach the data when # you have variables with the same name in multiple data frames. For example, # if you are working to predict who has a sexually transmitted infection and # if you have a variable called sex in a data frame called coitus and another # variable called sex in a data frame called demographics, you can easily # accidentally use a person's gender when you intended to use an indicator for # sexual activity. Look here a good brief introduction: # http://www.r-bloggers.com/to-attach-or-not-attach-that-is-the-question/ # Make a summary table of the race and sex variables found in the nonResponse # data frame . with(nonResponse, # Use the lbp dataset. # The table() function makes frequency count tables. Here it will make a table # for the 3 levels of race and the 2 levels of sex. table(race, sex) )
Box 5.4 Population profile from a report
Box 5.4 Code
Click here to show code with commentssetwd("~/Documents/Books/Presenting/data/rData/") load("lbp.RData") lbp$time <- factor(lbp$time, levels = 1:5, # shorthand for c(1, 2, 3, 4, 5) labels = c("Less than 1 week", "1 week to < 4 weeks", "4 week to < 8 weeks", "8 week to < 6 months", "6 months and over" ), order = TRUE ) lbp$pain <- factor(lbp$pain, levels = 1:6, labels = c("No pain at all", "Little pain", "Moderate pain", "Quite bad pain", "Very bad pain", "Almost unbearable pain" ), order = TRUE ) with(lbp, # Use the lbp dataset. # The na.omit() function returns the non missing data. To learn about how # it handles missing data look here: # http://www.ats.ucla.edu/stat/r/faq/missing.htm table(na.omit(time)) # Make a frequency table after removing missing data. ) with(lbp, # Calculate percentages by dividing the counts of a frequency table # divided by the number of not missing elements and multiply by 100. # The round() function takes two arguments. The first is a number and # the second is the number of decimal places. Here, round returns a # number with 0 decimal places. The length function says how many items # were found. Here it counts the time assessments after the missing times # are removed. round(100 * table(na.omit(time))/length(na.omit(time)), 0 ) ) with(lbp, table(na.omit(pain)) ) with(lbp, round(100 * table(na.omit(pain))/length(na.omit(pain)), 0 ) )
Table 5.1 Single Concise table from a paper with adjusted and unadjusted estimates
Table 5.2 Table suitable for an oral presentation
Table 5.1 and 5.2 Code
Click here to show code with commentssetwd("~/Documents/Books/Presenting/data/rData/") load("vaginosis.RData") # The library() function attaches a package full of functions (and sometimes # data frames) to R's "search list". When you type riskratio(), R searches that # list to figure out what to do. If you have not attached the package that # included a riskratio() function, R will complain that it can not find the # function "riskratio". # You can encounter problems when two packages include objects with the same # name. R will give a warning like: # The following object(s) are masked ... # when you attach a package that includes a function that is already in the # search list. R calls this problem masking. The detach() function, shown # below, can be used to remove packages from the search list and help avoid this # problem. library(epitools) # The riskratio() function in the epitools package shows a risk ratio with # normal approximation confidence intervals and p values. To learn more, type # ?riskratio after typing library(epitools) and look at the details section of # the help file. with(vaginosis, riskratio(ageunder25, bvyes)) # predictor outcome 1 event with(vaginosis, riskratio(blackethnic, bvyes)) with(vaginosis, riskratio(socialclass3to5, bvyes)) with(vaginosis, riskratio(single, bvyes)) with(vaginosis, riskratio(pill, bvyes)) with(vaginosis, riskratio(noprevpreg, bvyes)) with(vaginosis, riskratio(top, bvyes)) detach("package:epitools", unload = TRUE) # prevent masking conflicts
Color coding Color coding created by Pretty R at inside-R.org
Figure 5.2 Several binary variables on one graph suitable for a poster or talk
Figure 5.2 Code
Click here to show code with commentssetwd("~/Documents/Books/Presenting/data/rData/") load("vaginosis.RData") # The function() function is used to make a new function called rnd() that will # be used below. To start to learn about writing R functions look here: # http://www.r-bloggers.com/how-to-write-and-debug-an-r-function/ rnd <- function(x) { # To understand how the function works, remove the leading # in the four # lines of code below and run the code one line at a time: # # table(vaginosis$bvyes) # Make a frequency count. # prop.table(table(vaginosis$bvyes)) # Convert the count to percentages. # prop.table(table(vaginosis$bvyes))[2] # Only keep the 2nd number. # round(prop.table(table(vaginosis$bvyes))[2], 2) # Round it to 2 decimals. # # That is, use the table() function to make a frequency table on x; turn that # into a table of percentages by using the prob.table() function. Type # ?prop.table and run the example code to see what it does. Take the # 2nd percentage (the yes category), multiply that by 100 and use the round() # function to round it to two decimal places. round(100 * prop.table(table(x)), 2 ) } # Apply the rnd() function to the bvyes variable for everyone in the vaginosis # dataset. allScore <- with(vaginosis, rnd(bvyes)) allScore # Print the rounded percentage. # Apply the rnd() function to the bvyes variable for everyone who has ageunder25 # equal to 1 in the vaginosis dataset. The subset() function takes a data frame # as its first argument and a "logic check" argument, which is actually called # subset, which is used to determine which records/rows to include. ageunder25Score <- with(subset(vaginosis, ageunder25 == 1), rnd(bvyes)) blackethnicScore <- with(subset(vaginosis, blackethnic == 1), rnd(bvyes)) socialclass3to5Score <- with(subset(vaginosis, socialclass3to5 == 1), rnd(bvyes)) singleScore <- with(subset(vaginosis, single == 1), rnd(bvyes)) topScore <- with(subset(vaginosis, top == 2), rnd(bvyes)) # The data.frame() function can be used to convert (technically it is called # coerce) its arguments into being a new data frame . Here, two vectors are # created. The first, called what, is holding character strings. The second # vector, called per, is holding numbers. They are glued together to form a # data frame called theFrame that has two columns and six rows. theFrame <- data.frame(what = c("All women (1201)", "Under 25 (150)", "Afro-Caribbean (116)", "Social Class 3-5 (415)", "Single (94)", "Previous termination (207)" ), per = c(allScore, ageunder25Score, blackethnicScore, socialclass3to5Score, singleScore, topScore ) ) # The barplot() function draws bar charts. It can use many arguments, some of # which are shown below, to control the details of what is shown. To learn # more, type ?barplot and look at the Usage and Examples section. Also look # here: http://www.statmethods.net/graphs/bar.html barplot(theFrame$per, names.arg = theFrame$what )
Figure 5.3 Graph for an ordered categorical variable suitable for poster or talk
Figure 5.3 Code
Click here to show code with commentssetwd("~/Documents/Books/Presenting/data/rData/") load("lbp.RData") lbp$pain <- factor(lbp$pain, levels = 1:6, labels = c("No pain at all", "Little pain", "Moderate pain", "Quite bad pain", "Very bad pain", "Almost unbearable pain" ), order = TRUE ) barplot(with(lbp, # Calculate percentages by dividing the counts of a frequency # table divided by the number of not missing elements and multiply # by 100. Round results to 0 decimal places. round(100 * table(na.omit(pain))/length(na.omit(pain)), 0 ) ), ylab = "Percentage of patients" # An axis label for the barplot function. ) # The title() function can add titles to many common plots including graphics # created by the plot() and barplot() function. To learn more look here: # http://www.statmethods.net/advgraphs/axes.html title(main = "Level of pain experienced at the time of completion of the recruitment questionnaire (N=555)")
Figure 5.4 Graph comparing two groups of patients suitable for poster or talk
Figure 5.4 Code
Click here to show code with commentssetwd("~/Documents/Books/Presenting/data/rData/") load("xray.RData") xray # Notice there is one record per observation. That is, the data is in # long/narrow format. The bar plot will want the data reorganized to have # one record per result. That is, the data should be in wide format. This # can be done with many functions. The unstack() function is perhaps the # simplest. x <- unstack(xray, percent ~ when # The percents are shown in the body of the new # data frame . The new column variables are based on # the "when" variable. ) # The row.names() function can give labels for the rows in a data frame. row.names(x) = c("Referred", "Not Referred") # The reshape package has a function called rename() to easily rename variables. # That function uses two arguments, the data frame and the details on the # change. library(reshape) x <- rename(x, # In the x data frame , rename the X1 variable to appear with label. c(X1 = "Less than 1 week") ) x <- rename(x, c(X2 = "1 week to < 8 weeks")) x <- rename(x, c(X3 = "8 week to < 6 months")) x <- rename(x, c(X4 = "6 months and over")) # You can run into warnings/issues if two R packages have the same function. # The function reshape() is used in several packages which are commonly used. # So, it is good hygiene to unload it when you are done. detach("package:reshape", unload = TRUE) # Prevent conflicts with rename function. barplot(as.matrix(x), # Convert the x data frame to be a matrix. legend.text = row.names(x), beside = TRUE, # Do a side by side bar chart. col = c("grey", "red"), # The bar colors args.legend = list(bty = "n"), # Put no box around the legend/key. ylab = "Percentage of patients" ) title(main = "Length of present episode for patients referred for X-ray in observational study (N=427)")
Figure 5.5 Calculating the confidence interval of a geometric mean
Figure 5.5 Code
Click here to show code with commentssetwd("~/Documents/Books/Presenting/data/rData/") load("hdl.RData") # Load the epiDisplay package for the ci() function. library(epiDisplay) # The ci() function calculates the confidence limits and other statistics and # save them into a data frame named stats. This is done so the values can be # exponentiated below. stats <- ci(hdl$loghdl) stats # Print the values. exp(stats[c(2, 5, 6)]) # Do the exponential function on the 2nd, 5th, and 6th # objects (columns) in the stats data frame . detach("package:epiDisplay", unload = TRUE)
Color coding Color coding created by Pretty R at inside-R.org
Table 5.3 Presenting several types of variables with a common theme in one table
Table 5.3 Code
Click here to show code as textR code
Figure 5.6 Using a graph to compare a distribution with a cut-off
Figure 5.6 Code
Click here to show code with commentssetwd("~/Documents/Books/Presenting/data/rData/") load("pvd.RData") # Make a true/false indicator variable in the pvd data frame for values. # >= 140 will be true pvd$high <- (pvd$sbp >= 140) # The title for the next graphic is extemely long. The entire thing can be # displayed on a big monitor but to have the code display on small monitors, the # parts of the title are written as three separate quoted strings and glued # together using the paste() function. theTitle <- paste("Distribution of systolic blood pressure from the Wandsworth", "Heart and Stroke Study in 1575 patients showing the", "proportion with high blood pressure (\u2265 140 mm Hg)" ) # The ggplot2 library contains the commonly used ggplot() and qplot() functions # which can be used to make simple and very complex graphics. See chapter 2 # for more details. library(ggplot2) ggplot(pvd, aes(x = sbp, # The x axis attribute will be the sbp variable. fill = high) # The fill color will be based on the high variable. ) + geom_histogram( # Say this is a histogram. aes(y = ..density..), # Use the density for the y axis. colour = "black", # Set the bar outline colors. binwidth = 10) + # Set the width of the bars. theme_bw() + # Start with a black and white graphic. scale_fill_manual(values = c("gray","red")) + # Set the fill colors. xlab("Systolic blood pressure mm Hg") + ylab("Proportion of patients") + theme(legend.position = "none") + ggtitle(theTitle) + # Add the 161 charter wide title. scale_x_continuous(breaks = seq(40, 300, by = 20) )