### R code

## R code that I have up and running* — especially those sets of functions that I find continuously useful — is shared here for friends, collaborators, and you.

Please forage freely. You are also welcome to leave a comment if you know of other great ways to tackle some of the tasks coded here.

*The code may not be perfect, but it should be free of major errors or misuses.

**Coded below:**

The Basics - starting and basic data manipulation

Go-to tests - Chi-Square, Anova, Means-comparison

Plotting

Mapping

GLM & GAM basics

Credits

### -----------------------THE BASICS --------------------------------------------------

###
1. Getting started

#Housekeeping

# start with these every time that you open R

rm(list=ls()) # clears all objects from the workspace

graphics.off()

**#Set your working directory** so that R knows where to find your files

**#**

**Import data**

#usually prepare and save data in a .csv file (comma separated)

read.csv("FileName.csv", header=T) #if the file has column labels

#best to assign it to an object with = or <-

#Example: YourData =
read.csv("Data_forR.csv", sep=",", header=T)

**#**

**Check data**

head(YourData)

#This shows the first few lines of your data.

#It should look familiar.

**#**

**Export & save**

write.csv(YourData, file="YourFile.csv")

#saves to working directory

###
2. Manipulating data

**#**

**Classify data**

#inform R that a column of your data contains factors or numbers (as.numeric)

YourData$Column1 <-
as.factor(YourData$Column1)

# R does not automatically recognize dates, so here is a handy way

#to make sure it knows that your dates are dates and thus graphs #them in order

#If your dates are entered as 5/30/2015:

YourData$Date <- as.Date(YourData$Date, "%m/%d/%Y") # %Y for 4-digit years like 2015; %y is for 2-digit years

# R does not automatically recognize dates, so here is a handy way

#to make sure it knows that your dates are dates and thus graphs #them in order

#If your dates are entered as 5/30/2015:

YourData$Date <- as.Date(YourData$Date, "%m/%d/%Y") # %Y for 4-digit years like 2015; %y is for 2-digit years

**# Sorting data**

#using the package plyr

```
# use the arrange() function, then simply feed it your data & the ordering column
arrange(YourData, Date)
```

**#**

**Transposing your data** (long to wide)

#I usually do this so that each species has a column

#using the package

require(reshape2)

#Even though data is already long, need to melt it to long format

# so that program recognizes appropriate ID & measurement variables

#Basic syntax:

require(reshape2) #load the reshape2 package

#Even though data is already long, need to melt it to long format

# so that program recognizes appropriate ID & measurement variables

#Basic syntax:

data.long<-melt(your.data,id.vars=c("variables",

"these become column names"),

measure.vars=("column that will become the data"))

#Example:

occ.long <- melt(TheData, id.vars=c("UniqueID" , "Species"), measure.vars=("Abundance"), na.rm=TRUE)

head(occ.long)

#now that it knows what's what, dcast() into wide format

#the formula creates the data frame format

#Basic syntax:

occ.long <- melt(TheData, id.vars=c("UniqueID" , "Species"), measure.vars=("Abundance"), na.rm=TRUE)

head(occ.long)

#now that it knows what's what, dcast() into wide format

#the formula creates the data frame format

#Basic syntax:

data.wide <- dcast(data.long, formula = stayingVar1 + stayingVar2 ~ newColumnVar, value.var="measurement")

#NOTE: with multiple observations, use fun.aggregate=sum to add them #together, as with juvenile and adult occurrences of same species

#Example:

occ.wide <- dcast(occ.long, formula = UniqueID ~ Species,

value.var="value", fun.aggregate = sum)

head(occ.wide)

head(occ.wide)

**#**

**Relativizing/standardizing (transformations)**

#For a matrix of abundances (rows) of species (columns)

#this divides by the row total to remove
abundance effects

#Using the package vegan

require(vegan)

#syntax:

new.name <- decostand(data, "total")

#display the sum of row vectors to determine
proper scaling

# each should = 1

apply(new.name, 1, sum)

**#Set an a-priori intercept** from predictor factor levels

data$Predictor <- factor(data$Predictor, levels = c("A", "B", "C"))

# first level listed becomes intercept

**#**

**Merge multiple data frames**with a common identifier

#I use this to merge my environmental parameters with

# species occurrence records, etc

# The identifier is any column containing unique identifying info that is in both

#data sets. Be sure that those column names match.

**#**

**Creating new identifier by merging**two existing ID columns

#using paste()

data$newID <- paste(data$ID1, data$ID2,
sep="_")

**#**

**Subsetting data**

subset(YourData, ColumnLabel==stipulation)

#the condition after the comma is the stipulation,

#e.g. durationHours==12 (only includes 12h
trials)

#to stipulate multiples, use ==c("crit1", "crit2", "critx")

#to stipulate multiples, use ==c("crit1", "crit2", "critx")

#to drop excluded factors that appear as "ghosts", e.g. in figures

NewData <- droplevels(YourSubDataSet)

NewData <- droplevels(YourSubDataSet)

### ---------------------- GO-TO TESTS ------------------------------------------------

#### 1. Chi-Square

#rows = e.g. species groups tested,
#columns = e.g. habitats ordered grass,
pneum, prop

chisq.test(YourData)

#data input is observations (number of x
recovered in

#treatment 1, 2, 3); it then divides those evenly to calc expected.

#To set expected values, set a new data
series to your expected

#values, then include them within the function by adding the argument

# p=YourExpectedValues

####
2. **ANOVA and Means-comparison**

#Throughout:

#x=response, y=predictor (feed data column labels)

#for interaction, use y*z

#indicate data set from which variables are drawn

summary(testObject) #provides more detailed output

plot(testObject) #often works, variable output

#TestObject = object created with initial
test, e.g. aov(x~y, data)

#Note: basic assumptions can be evaluated by plotting the model
#(e.g. "plot(aov(x~y, data))"). Fxn returns variance plot,

#qqplot (for
normality of errors), residuals vs fitted values,

#and Cook’s stat to identify
influential point (outliers).

**#**

**Testing homogeneity of variances:**

#Bartlett Test of Homogeneity of Variances (parametric)

#y is a numeric variable and G is the grouping variable.

bartlett.test(y~G, data=mydata)

# Homogeneity of Variance Plot

library(HH)

#syntax: hov(y~G, data=mydata)...ditto plot but with hovPlot()

**#**

**t-test**:

#between 2 groups

t.test(x~y,YourData)

**#**

**ANOVA**:

#between 2 or more groups

aov(x~y, data=YourData)

#can add interaction via x~y*z

**#**

**Post-hoc means comparison**:

TukeyHSD(TestObject)

#Or simply try summary.lm(model) to get the
same basic output #(comparing levels of factor for effect sizes)

###
**---------------------------PLOTTING---------------------------------------**

####
**1. Box plot**

plot(x, y, xlab="YourNameforX", ylab="YourNameforY")

#add $data to x
& y as needed

####
**2. Bar chart w/ SE**

#Preparation for Bar plot w/ SE using the package

__gplots__

#This simply requires the data that you want to plot (means and SE)

# and any grouping factor that you want each bar to depict

means <-tapply(YourData$Response,

INDEX=list(YourData$GroupingFactor),FUN=mean)

std <- tapply(YourData$Response,

INDEX=list(YourData$GroupingFactor),FUN=sd)

SE <- std/(sqrt(table(YourData$GroupingFactor)))

means

std

SE

#Now plot the objects we just created

require(gplots)

barplot2(means, plot.ci=TRUE, ci.l=means-SE, ci.u=means+SE,

ylab="Your label")

box() #this adds a perimeter between your graph and the axes

#You can use col= to add colors

#I use colors to distinguish my groups (each bar).

#To do this, create a palette of colors

YourPalette <-c("color1", "color2", "color3") #use color names or #

#Then add it to the graph code using the argument

col=YourPalette

###Alternatively, here is a much longer way to make bar charts w/SE

# To plot a bar chart with SE, use this ridiculous mess of code

#To plot means and
standard errors

#we need to use
barplot or barplot2() and tapply

#create a matrix
containing the mean length for

#each category using
tapply()

tapply(response
variable , INDEX = list( grouping variables ),

desired summary stat)
#example syntax:

tapply( data$length
, INDEX = list(data$site), mean)

means <- tapply(
data$length,INDEX=list(data$cat1,data$cat2),mean )

std <- tapply(
data$length,INDEX=list(data$cat1,data$cat2),sd )

std

means

# to plot the means
(without SE)

barplot(means,beside=TRUE,ylim=c(0,200),col=c("gray","white"))

#Add a frame around
the figure

box()

#You can modify the ylim based on your own data range

# to plot means w/
SE, make a matrix of standard errors

#Start with a table
of sample sizes

#Basic syntax:

table(data$TreatmentGraphed) #separate multiple by commas

#Standard error of
the mean = standard deviation / squareroot of sample size

SE<-std/(sqrt(table(data$TreatmentGraphed)))

SE

# now load the
plotting package

#the package is gplots
#use all the pieces we've made to make the final figure

require(gplots)

barplot2(means,

beside = TRUE,

col=c("gray","white"),

plot.ci = TRUE,

ci.l = means-SE,

ci.u = means+SE,

ylim = c(100,180),

xpd=FALSE,

ylab="Response (cm)")

box()

#Add a legend

te<-c("Name1","Name2")

pc<-c(22,22)

b<-c("gray","white")

ce<-c(1.5,1.5)

legend(5,190,pch=pc,legend=te,pt.cex=ce,pt.bg=b)

####
**3. Effects from other model output**

# Use the effects package (provides the allEffects() function)

plot(allEffects(model))

# It is incredibly helpful to see the effects & fit of each model

# Use the visreg package

visreg(model, "predictor")

# This is nice because it gives you side by side panels of levels

# More details follow in the GLM/GAM section below

###
**---------------------------MAPPING-----------------------------------------**

#Can load basic map (e.g. states) from the maps package

TheStates <-
map_data("state")

# Reduce to a state or region with

region <- subset(TheStates, region %in% c(
"statename1", "statename2", ...))

# Plot with the

#the package is ggplot2
#overlay landmarks (e.g. research sites) onto the map

wYourSites <- StatePlot + geom_point(
data=location, hjust=0.5, vjust=-0.5, aes(x=long, y=lat),
color="grey", size=2 )

## ------------------GLM & GAM basics ---------------------------

*#First: packages, diagnostics, etc are listed below.*

*#Models follow.*

**#Packages**

library(effects) #all effects plots for visualization

library(car) #GLM

library(MASS) #negative binomial model

library(pscl) #zero-inflated models

library(mgcv) #GAM

library(AICcmodavg) #AICc comparison of GAMs

library(visreg) #to plot/visualize GAM output (sidexside panels)

**#Function to calculate overdispersion remaining in a model**

# Estimate residual overdispersion for a fitted model: the Pearson
# chi-square statistic divided by the residual degrees of freedom.
# Values substantially greater than 1 suggest the model underestimates
# the variance (overdispersion); values near 1 indicate an adequate fit.
#
# model: a fitted model object that supports residuals(type = "pearson")
#        and df.residual(), e.g. a glm fit.
# Returns a single numeric dispersion estimate.
dispersion <- function(model) {

  pearson_chisq <- sum(residuals(model, type = "pearson")^2)

  # df.residual() equals n - p for an ordinary glm, matching
  # length(model$y) - length(model$coefficients), but it also handles
  # aliased (NA) coefficients correctly and works for fits that do not
  # store a $y component.
  pearson_chisq / df.residual(model)

}

#Note: function provided by K. Edwards

#Diagnostics

#Diagnostics

plot(data or model)

hist(data)

plot(allEffects(model))

summary(model)

Anova(model) #use test='F' for quasipoisson

AIC(model)

AICc(model) #especially for GAMs

gam.check(model) # note: the mgcv function is gam.check(), with a dot

visreg(model, "predictor")

**#Plot residuals vs the predictors**for each model

plot(residuals(model, type = "response") ~ fitted(model), ylab = "raw residuals", xlab = "predicted value")

abline(h = 0, lty = 2, lwd = 4, col = 'red')

# Model structure and functions

# for all models, can use multiple predictors...

# ...with +(additive) or *(interaction)

#### 1. GLM

# GLM basic model (start with Poisson); glm() is in the base "stats" package

glm.model = glm(Response ~ Predictor, family = poisson,

data = YourData)

# check residuals

# If overdispersed, improve fit by changing family to =quasipoisson

# Diagnose partially by multiplier (bigger = model...

# ...doing more maximize fit of overdisp resid)

# Still overdispersed? Try negbin family (see below)

#GLM model with negative binomial family, function in "MASS" package

nb.glm.model <-glm.nb(Response ~ Predictor, data=YourData)

# Diagnose partially with theta (smaller = stronger model...

# ...doing more to fit overdispersed residuals)

#### 2. Zero-inflated GLM

# zero-inflated model, function from "pscl" package

ZI.model = zeroinfl(Response ~ Predictor |1, data = YourData,

dist = "poisson")

# model structure= Count model | zero (binomial) model

# Use 1 as null intercept in binomial model

# Can replace with predictors

# dist=" " can be "poisson" or "negbin"

# General rule: use dist that fit best in initial GLM model

# Use "poisson" if overdispersion is solely due to Z.I.

# Use "negbin" when there is Z.I. and other overdispersion

#### 3. GAM

#basic GAM model, function from "mgcv" package

gam.model = gam(Response ~ s(ContPredictor), data=crab, family ="nb")

#Can include regular predictors and/or combine with smoothers

#s(predictor) is a smoothing function that operates on a cont predictor

#When few points in continuous predictor, include k as

# s(predictor, k=#) to reduce knots (few less than # points)

#for interaction within smoother, specify

# s(ContPredictor, by=GroupingPredictor)

# without family specification, defaults to normal distribution

# note = "nb" (not "negbin"), see resource file for others

#to plot one GAM smoothed group at a time

plot(gam.model, select=smoother#, trans=exp,

shift=coef(gam.model)[1] +

coef(gam.model)[#], main="Title")

#Being fed specifications from model output:

# smoother# is the # of the relevant smoother group in list

# [1] is the intercept value

# [#] is the num. of the predictor effect in list(after intercept)

# when other predictors too, can add number (e.g. +1)

# to account for those effects (diff between those and intercept)

### *******************Credits*****************************

*Code is always ongoing and collaborative, but these folks may have significantly contributed to the code listed here. My many thanks to them!*

Alex Forde, UMD

Mayda Nathan, UMD

Dr. Kyle Edwards, UHI

Dr. Megan Riley, USC alum

## Comments

## Post a Comment