Assignment 9

1. Modify your existing functions so that only the dataframe is the argument input

##################################################
# FUNCTION: anova
# performs an analysis of variance
# inputs: a vector of categorical data (x) and a vector of continuous data (y)
# outputs: degrees of freedom, sums of squares, mean, F-value, and p-value
#------------------------------------------------- 
anova <- function(dataFrame=data.frame(as.factor(rep(c("High","Low"),each=5)),runif(10))) {
  x <- dataFrame[,1]
  y <- dataFrame[,2]
  newDataFrame <- data.frame(x,y)
  myMod <- aov(y~x, data = newDataFrame)
  myOut <- summary(myMod)
  return(myOut)
}

anova()

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## x            1 1.1976  1.1976      73 2.71e-05 ***
## Residuals    8 0.1312  0.0164                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

##################################################
# FUNCTION: anovaGraph
# performs an analysis of variance and creates a boxplot depicting the results
# inputs: a vector of categorical data (x) and a vector of continuous data (y)
# outputs: barplot 
#------------------------------------------------- 
anovaGraph <- function(dataFrame=data.frame(as.factor(rep(c("High","Low"),each=5)),runif(10))) {
  x <- dataFrame[,1]
  y <- dataFrame[,2]
  newDataFrame <- data.frame(x,y)
  myMod <- aov(y~x, data = newDataFrame)
  myOut <- boxplot(y~x,data=newDataFrame,col=c("deepskyblue4","deepskyblue3","deepskyblue2"))
  return(myOut)
}

anovaGraph()

## $stats
##           [,1]      [,2]
## [1,] 0.2933598 0.5766454
## [2,] 0.4401457 0.5766454
## [3,] 0.4936153 0.6816531
## [4,] 0.6486385 0.8870582
## [5,] 0.6978159 0.9129020
## 
## $n
## [1] 5 5
## 
## $conf
##           [,1]      [,2]
## [1,] 0.3462948 0.4623161
## [2,] 0.6409358 0.9009900
## 
## $out
## [1] 0.01007
## 
## $group
## [1] 2
## 
## $names
## [1] "High" "Low"

2. Use your data from HW4 to input into your functions (where applicable) and produce output and plots

# Read in the data from Homework 4
z <- read.table("Data.csv", header = TRUE, sep=",", stringsAsFactors=FALSE)

# Effect of warming treatment on Shannon diversity
x <- data.frame(z$WarmingTreatment,z$DivShann)
anova(x)

##              Df Sum Sq Mean Sq F value Pr(>F)
## x             1   0.83  0.8276   1.486  0.225
## Residuals   118  65.71  0.5568

anovaGraph(x)

## $stats
##           [,1]      [,2]
## [1,] 0.7577747 0.6171423
## [2,] 2.1728058 2.0442667
## [3,] 2.8217602 2.6659332
## [4,] 3.2198981 3.1863547
## [5,] 4.1736013 4.4157717
## 
## $n
## [1] 60 60
## 
## $conf
##          [,1]     [,2]
## [1,] 2.608177 2.432973
## [2,] 3.035343 2.898893
## 
## $out
## [1] 0.2678693
## 
## $group
## [1] 2
## 
## $names
## [1] "control" "warmed"

# Effect of warming treatment on richness
y <- data.frame(z$WarmingTreatment,z$Richness)
anova(y)

##             Df Sum Sq Mean Sq F value Pr(>F)
## x            1    552     552   0.097  0.757
## Residuals   58 330361    5696               
## 60 observations deleted due to missingness

anovaGraph(y)

## $stats
##       [,1] [,2]
## [1,]  10.0    6
## [2,]  37.0   35
## [3,]  89.5  102
## [4,] 152.0  160
## [5,] 296.0  256
## attr(,"class")
##   control 
## "integer" 
## 
## $n
## [1] 30 30
## 
## $conf
##           [,1]     [,2]
## [1,]  56.32627  65.9416
## [2,] 122.67373 138.0584
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] "control" "warmed"

# Effect of warming treatment on evenness
a <- data.frame(z$WarmingTreatment,z$Evenness)
anova(a)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## x            1 0.0733 0.07334   3.815 0.0556 .
## Residuals   58 1.1149 0.01922                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 60 observations deleted due to missingness

anovaGraph(a)

## $stats
##           [,1]      [,2]
## [1,] 0.4467246 0.3090829
## [2,] 0.6190977 0.5025169
## [3,] 0.7214350 0.6102760
## [4,] 0.7725752 0.7655159
## [5,] 0.9093725 0.9024945
## 
## $n
## [1] 30 30
## 
## $conf
##           [,1]      [,2]
## [1,] 0.6771618 0.5344094
## [2,] 0.7657083 0.6861425
## 
## $out
## numeric(0)
## 
## $group
## numeric(0)
## 
## $names
## [1] "control" "warmed"

3. Make a random data set that mimics your existing data (test mean and variance) and use the functions

# Read in original data
z <- read.table("Data.csv", header = TRUE, sep=",", stringsAsFactors=FALSE)

# I chose to focus on one of my response variables (Shannon diversity, or "DivShann"). To begin, I examined the distribution of my data, and calculated the mean and standard deviation:
hist(z$DivShann)

mean(z$DivShann)

## [1] 2.644833

sd(z$DivShann)

## [1] 0.7477299

# Given the distribution of my data, to create a random data set with structure similar to my original data, I decided to use the 'rsnorm' function to simulate a skewed normal distribution. I used the sample size, mean, and standard deviation from my actual data set as inputs in the 'rsnorm' function. I then tested out different values for the skewness parameter (xi) and used histograms to compare each simulated random data set to my own data until the distributions matched relatively closely.

library(fGarch)

## Loading required package: timeDate

## Loading required package: timeSeries

## Loading required package: fBasics

##

## Rmetrics Package fBasics

## Analysing Markets and calculating Basic Statistics

## Copyright (C) 2005-2014 Rmetrics Association Zurich

## Educational Software for Financial Engineering and Computational Science

## Rmetrics is free software and comes with ABSOLUTELY NO WARRANTY.

## https://www.rmetrics.org --- Mail to: info@rmetrics.org

par(mfrow = c(2,1))

# Random data with xi = -5 compared to my data
hist(rsnorm(120, mean = 2.644833, sd = 0.7477299, xi = -5)) 
hist(z$DivShann)

# Random data with xi = -3 compared to my data
hist(rsnorm(120, mean = 2.644833, sd = 0.7477299, xi = -3)) 
hist(z$DivShann)

# Random data with xi = -4 compared to my data (closest match) 
hist(rsnorm(120, mean = 2.644833, sd = 0.7477299, xi = -4)) 
hist(z$DivShann)

# Create random data using xi = -4
y <- rsnorm(120, mean = 2.644833, sd = 0.7477299, xi = -4)

# Test the effect of warming treatment on the response using the random data
randomData <- data.frame(z$WarmingTreatment,y)
anova(randomData)

##              Df Sum Sq Mean Sq F value Pr(>F)
## x             1   0.00  0.0024   0.003  0.954
## Residuals   118  82.64  0.7004

par(mfrow = c(1,1))
anovaGraph(randomData)

## $stats
##           [,1]      [,2]
## [1,] 0.8384443 0.5892447
## [2,] 2.0482677 2.1199834
## [3,] 2.8254233 2.8694451
## [4,] 3.2616891 3.1531174
## [5,] 3.6319215 3.6139021
## 
## $n
## [1] 60 60
## 
## $conf
##          [,1]     [,2]
## [1,] 2.577913 2.658709
## [2,] 3.072934 3.080181
## 
## $out
## [1] -0.792271
## 
## $group
## [1] 1
## 
## $names
## [1] "control" "warmed"

4. Manipulate your random data set to mimic a hypothesis you created (i.e. I expect a positive relationship between the number of cats someone owns and that person’s IQ) and use the functions

# Hypothesis: Shannon diversity is higher in warmed plots relative to control plots.

print(z$WarmingTreatment) # view data

##   [1] "control" "warmed"  "control" "warmed"  "control" "warmed"  "control"
##   [8] "warmed"  "control" "warmed"  "control" "warmed"  "warmed"  "control"
##  [15] "warmed"  "control" "control" "warmed"  "control" "warmed"  "control"
##  [22] "warmed"  "control" "warmed"  "control" "warmed"  "control" "warmed" 
##  [29] "warmed"  "control" "warmed"  "control" "warmed"  "control" "warmed" 
##  [36] "control" "warmed"  "control" "control" "warmed"  "control" "warmed" 
##  [43] "control" "warmed"  "control" "warmed"  "control" "warmed"  "control"
##  [50] "warmed"  "warmed"  "control" "warmed"  "control" "warmed"  "control"
##  [57] "warmed"  "control" "warmed"  "control" "control" "control" "warmed" 
##  [64] "control" "warmed"  "control" "warmed"  "warmed"  "control" "warmed" 
##  [71] "control" "warmed"  "control" "warmed"  "control" "warmed"  "control"
##  [78] "warmed"  "control" "warmed"  "control" "warmed"  "warmed"  "control"
##  [85] "warmed"  "control" "warmed"  "control" "warmed"  "control" "warmed" 
##  [92] "control" "control" "warmed"  "control" "warmed"  "control" "warmed" 
##  [99] "control" "warmed"  "control" "warmed"  "control" "warmed"  "warmed" 
## [106] "control" "warmed"  "control" "warmed"  "control" "warmed"  "control"
## [113] "warmed"  "control" "control" "warmed"  "control" "warmed"  "control"
## [120] "warmed"

z2 <- z[order(z$WarmingTreatment),] # sort data by warming treatment 
print(z2$WarmingTreatment) # view sorted data

##   [1] "control" "control" "control" "control" "control" "control" "control"
##   [8] "control" "control" "control" "control" "control" "control" "control"
##  [15] "control" "control" "control" "control" "control" "control" "control"
##  [22] "control" "control" "control" "control" "control" "control" "control"
##  [29] "control" "control" "control" "control" "control" "control" "control"
##  [36] "control" "control" "control" "control" "control" "control" "control"
##  [43] "control" "control" "control" "control" "control" "control" "control"
##  [50] "control" "control" "control" "control" "control" "control" "control"
##  [57] "control" "control" "control" "control" "warmed"  "warmed"  "warmed" 
##  [64] "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed" 
##  [71] "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed" 
##  [78] "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed" 
##  [85] "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed" 
##  [92] "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed" 
##  [99] "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed" 
## [106] "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed" 
## [113] "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed"  "warmed" 
## [120] "warmed"

# Simulate normally distributed random data for each warming treatment, and combine the two vectors of data
warmed <- rnorm(60, mean = 4, sd = 0.7)
control <- rnorm(60, mean = 2, sd = 0.7)
combined <- c(warmed,control)

# Test the effect of warming treatment on the simulated Shannon diversity data
randomDataBling <- data.frame(z2$WarmingTreatment,combined)
anova(randomDataBling)

##              Df Sum Sq Mean Sq F value Pr(>F)    
## x             1 122.94   122.9   206.1 <2e-16 ***
## Residuals   118  70.37     0.6                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

par(mfrow = c(1,1))
anovaGraph(randomDataBling)

## $stats
##          [,1]      [,2]
## [1,] 2.446143 0.3080192
## [2,] 3.376627 1.3304953
## [3,] 3.952301 1.9033361
## [4,] 4.425954 2.4290995
## [5,] 5.268086 3.4315183
## 
## $n
## [1] 60 60
## 
## $conf
##          [,1]     [,2]
## [1,] 3.738262 1.679246
## [2,] 4.166340 2.127426
## 
## $out
## [1] 1.734383
## 
## $group
## [1] 1
## 
## $names
## [1] "control" "warmed"

Use logistic regression to see how Shannon diversity of a sample might predict whether it is from a warmed or control plot

Logistic regression functions

##################################################
# FUNCTION: logReg
# performs a logistic regression analysis
# inputs: a vector of continuous data (x) and a vector of categorical data (y)
# outputs: slope, standard error, z-value, and p-value for the logistic regression
#------------------------------------------------- 
logReg <- function(x = rgamma(n=25,shape=5,scale=5), y = rbinom(n=25,size=1,p=0.5)) {
  dataFrame <- data.frame(x,y)
  myMod <- glm(y ~ x,
                 data=dataFrame,
                 family=binomial(link="logit"))
  return(summary(myMod)$coefficients)
}

##################################################
# FUNCTION: logRegGraph
# performs a logistic regression analysis and creates a scatterplot depicting the results
# inputs: a vector of continuous data (x) and a vector of categorical data (y)
# outputs: scatterplot
#------------------------------------------------- 
logRegGraph <- function(x = rgamma(n=25,shape=5,scale=5), y = rbinom(n=25,size=1,p=0.5)) {
  dataFrame <- data.frame(x,y)
  myMod <- glm(y ~ x,
                 data=dataFrame,
                 family=binomial(link="logit"))
  myOut <- plot(x=dataFrame$x, y=dataFrame$y,pch=21,bg="green",cex=2)
  return(myOut)
}

Using logistic regression functions on the three data sets

# Actual data
z <- read.table("Data.csv", header = TRUE, sep=",", stringsAsFactors=FALSE)
logReg(x=z$DivShann, y=z$WarmingTreatment)

##               Estimate Std. Error   z value  Pr(>|z|)
## (Intercept)  0.8042558  0.6887884  1.167638 0.2429526
## x           -0.3038786  0.2504912 -1.213131 0.2250797

logRegGraph(x=z$DivShann, y=z$WarmingTreatment)

## NULL

# Random data with same structure as original data
x <- rsnorm(120, mean = 2.644833, sd = 0.7477299, xi = -4)
randomData <- data.frame(z$WarmingTreatment,x)
logReg(x=randomData$x, y=randomData$z.WarmingTreatment)

##               Estimate Std. Error   z value   Pr(>|z|)
## (Intercept)  1.2670118  0.6320632  2.004565 0.04500957
## x           -0.5139291  0.2442263 -2.104315 0.03535096

logRegGraph(x=randomData$x, y=randomData$z.WarmingTreatment)

## NULL

# Random data related to hypothesis
warmed <- rnorm(60, mean = 4, sd = 0.7)
control <- rnorm(60, mean = 2, sd = 0.7)
combined <- c(warmed,control)
randomDataBling <- data.frame(z2$WarmingTreatment,combined)
logReg(x=randomDataBling$combined, y=randomDataBling$z2.WarmingTreatment)

##              Estimate Std. Error   z value     Pr(>|z|)
## (Intercept) 10.425074   1.971131  5.288880 1.230677e-07
## x           -3.388557   0.620611 -5.460034 4.760447e-08

logRegGraph(x=randomDataBling$combined, y=randomDataBling$z2.WarmingTreatment)

## NULL