## NCEAS plant traits DGS- spring 2008## R tutorial week 6## Nathan Kraft & David Ackerly, UC Berkeley## nkraft@berkeley.edu, dackerly@berkeley.edu

##data import (same as before):raw <- read.csv(file="cornwellPlots.csv")
names(raw) <- c("plot", "species", "abund", "sla", "leaf_area", "seedsize", "wood_density")


trait_means <- read.csv(file="CornwellTraitMeans.csv")[,-1]#######################################################  data matching   #######################################################################there are several essential commands for intersecting and manipulating databases- we've touched on them before but today we'll use them more explicitly.

##The first we've seen a lot:

?subset

##this is used to take a subset of a dataframe with a condition that you specify.  Let's say we want a trait dataset for only the common species at Will's sites.  We can define common here as 200 or more individuals.

common <- subset(trait_means, trait_means$total_abundance>=200)

##as you can see above, the function takes the dataframe (or vector) you want to subset and then a logical condition that specifies the part of the dataframe that you want- here we want all rows where total_abundance is 200 or greater.

## Note that you don't need to include the data frame name again, if your condition refers to one of the columns, so the command above can also be written:

common <- subset(trait_means, total_abundance>=200)

## Check the result
head(common)

##you can accomplish a simmilar feat with the function "which"
?which

#as you can see, "which" returns the indices of a vector of true/ false variables that are TRUE.  

##Lets use it to make a vector of rare species:

##to build up, start with a test that will give you a vector of true and false:

trait_means$total_abundance<200

##now we can wrap that vector in "which()" to get the indices that are true:

which(trait_means$total_abundance<200)

##now we can use those numbers to index the rows of the dataframe that we want:

##note that I am indexing certain rows, then asking for all columns by leaving it blank after the ',':
rare <- trait_means[which(trait_means$total_abundance<200),]

head(rare)

## Just to check, let's do the same thing with subset:
rare2 <- subset(trait_means,total_abundance<200)

## all.equal is a useful command to check if 2 objects are the same, without having to compare all of the indidivual entries:
all.equal(rare,rare2)

dim(rare)
dim(common)
dim(trait_means)

##looks like we've got all of the species assigned to either common or rare (7+47=54).  But lets check for sure. 

##A great function for this is %in%

?%in%

##"%in%" and "match" are simmilar and share a help page- lets start with "%in%":

##the "function" %in% is actually an operator (like + or *) and therefore it goes in between two variables.  It tells you if anything from the second variable is found in the first:

common$species %in% rare$species

##you should get a vector of 7 "false"s.  This tells you that none of the species in the common dataframe  (the smaller of the two)match the species in the rare one.

##order matters:

rare$species %in% common$species

##now you should have a vector of 47 falses, indicating that none of the rare species are found in the common dataframe.  Looks good!

##remember you can sum trues and falses (false=0, true=1), so you could use something like this as a quick test in piece of code:

if(sum(rare$species %in% common$species)==FALSE){
	print("no species overlap found")
	}

##lets try a simpler example with some overlap in the data:

one<-c("red", "blue", "purple", "navy", "black")
two<-c("yellow", "orange", "red")

one %in% two

##should be a single TRUE in the first position, indicating that the first element of the "one" vector ('red') appears somewhere in the "two" vector, but none of the other entries do. Note however, that we don't know yet where 'red' appears in the second vector. That's coming up!

#VS

two %in% one

##this tells us that the third element of the "two" vector has a match in "one"

##now you can use the true/ false vector to index the original variable and find out which value in 'two' also appears in 'one':

two[which(two %in% one)] ##also works without the "which()"

##note that this is not the same as:
one[which(two %in% one)]

##this kind of mistake (above) is really easy to make with the %in% command- as a reminder, the variable you are indexing should always be the first one in the %in% statement

##This, however, is correct and tells us which value in 'one' also appears in 'two'.
one[which(one %in% two)]


## Now we come to what may be the most powerful command, but also the trickiest to use. The function "match(x,y)" returns the index in vector y, the second argument, where the corresonding item in x is found. So the result will have the same length as x. First, look again at %in%:

one %in% two

#now compare the result of 'match'

match(one, two)

##match returns the index of the second argument that matches each element of the first argument.  This can be very useful in certain situations. 

##so you would use:
two[match(one, two)]

## Note that items in the first vector can repeat, and match will return the corresponding index in the second vector each time. However, if items in the second vector repeat, then only the position of the first one will be returned. Like this:

three = c('red','blue','red')
four = c('purple','yellow','red','blue')

match(three,four)
match(four,three)

## PROBLEM 1:
## Write a few lines of code to produce the elements of the one vector that are not in two, and vice versa.  Remember that the "!" reverses true and false.

## PROBLEM 2:
##Now that you've done that, write a function that takes a data set of species occurrences in one or more plots, and another data set of species trait values, and returns a new data frame with the appropriate trait values assigned to each entry in the plot data set. In other words, if you start with:

## data frame 'plots':
# plot	spp
#	1	A
#	1	B
#	2	A
#	2	C

## data frame 'traits':
# species trait
#	A	1.7
#	B	2.2
#	C	3.1

## the function should return the following data frame:
# plot species	trait
#	1	A	1.7
#	1	B	2.2
#	2	A	1.7
#	2	C	3.1

## Start with this, and add your script inside the brackets. Remember, the traits file might have multiple traits. Can you give the user the option of which one to use, or to fill in the entire matrix? MAKE SURE to COMMENT your function so someone else can read it clearly!!


addTraitsToPlots = function(plots,traits) {
	# YOUR SCRIPT HERE!
	return(plotsWithTraits)
	}

## To test your script, you might want to create the small data frames above, and then strip the traits off of the 'raw' data to create the plots data:

head(raw)
cornwellPlots = raw[,1:2]
addTraitsToPlots(cornwellPlots,trait_means)

## PROBLEM 3:
## Write a function that replaces all of the NA's in the "raw" datafile with species trait averages from the "trait_means" dataframe.  You should only need to do this for sla and leaf_area, but you might consider what the function will do if there are no NA's!  Assume that the "raw" file and the "trait_means" file have matching column names- that is- that "species" is called "species" in both, etc.  MAKE SURE to COMMENT your function so someone else can read it clearly!!

##we've started if for you below.  The first two arguments are dataframes, the third is a character string that must match a trait column in both of the dataframes. 

replace_trait_NAs <-function(plot_data, trait_means, which.trait){
	
	
	}


##to run:
replace_trait_NAs(raw, trait_means, "sla")->temp

replace_trait_NAs(temp, trait_means, "leaf_area")->temp2

##as a check- there should be no seedsize NA's
replace_trait_NAs(temp2, trait_means, "seedsize")->temp3

full_plot <- temp3


######################
##Now you should have a plot by species by trait dataframe with no NA's in it! Please call it "full_plot" to work with the rest of this assignment

##lets just run a few diagnostics to check:

sum(is.na(full_plot))  ##should be 0

PO_raw <- subset(raw, raw$species=="Toxicodendron diversilobum")

PO_full <- subset(full_plot, full_plot$species=="Toxicodendron diversilobum")

PO_raw
PO_full
##should look good so far


##which poison oak SLA values do not match in the original and new  plot dataset?

is.na(PO_full$sla != PO_raw$sla)

##are those NA's all the same value?
PO_full$sla[is.na(PO_full$sla != PO_raw$sla)]

##should all be 173.4001- the overall mean for poison oak

##its a good idea to get in the habit of designing little tests like this for your new functions!  Its the best way to catch bugs!

##########################
## trait space tests ####
#######################

### Two weeks ago we started to explore the trait spacing tests described in the Stubbs and Wilson reading.  We showed you a function to calculate the standard deviation of nearest neighbor distance:

get_sd_NN_dist=function(vector){
	return(sd(diff(sort(vector))))
	}
	
##and also the average nearest neighbor distance:

get_ave_NN_dist=function(vector){
	return(mean(diff(sort(vector))))
	}
	
##The latter corresponds to test statistic 4 in the Stubbs and Wilson paper. 

## we would like you to write functions for a few more of the tests in this paper- refer to it for more details:

##TS 1- mean nearest neighbor euclidean distance.  This should be able to work with 1 or more traits, and it should calculate the average of all euclidean nearest neighbor distances in a given plot.  For one trait, the answer should be the same as get_ave_NN_dist().  Also- it should have and argument 'scaled' that will z-score the data if it is set to TRUE. 

##here is a nice small sample community to play with:

seventeen <- subset(full_plot, full_plot$plot=="17")

## PROBLEM 4:

get_mean_NN_ED=function(trait_matrix, scaled==TRUE){
	mean_distance<-NULL
	##your code here:
	
	return(mean_distance)
	}

##and a big hint!!
?dist
## or for slightly more options:
require(vegan)
?vegdist
## now test your function with:
get_mean_NN_ED(seventeen[,4:7])

## PROBLEM 5:	
##TS 3, minimum euclidean distance, should be trivial now if you have the first one working- give it a shot!!

get_min_NN_ED=function(trait_matrix, scaled==TRUE){
	min_distance<-NULL
	##your code here:
	
	return(min_distance)
	}
	
## PROBLEM 6:
## Once you've got functions working, you can call them from inside another function. This kind of partitioning is good scripting practice so you avoid repeating the same code over and over - just wrap it up in a function and call it when you need it. As an example, create a function which returns a maatrix of univariate and multivariate results from a trait_matrix


spacingStats = function(trait_matrix,scaled=TRUE) {
	
	return(results)
	}

## where 'results' has the following structure for a plot:
## trait	stat	value
##	sla		sdNNdist	xxx
## 	sla		aveNNdist	xxx
## [other traits and stats here, and then the multivariate:]
##	all		meanNN_ED	xxx
##	all		minNN_ED	xxx

## try your new function with:
spacingStats(seventeen,4:7)


## PROBLEM 7:
## As a next step, expand this function so you can pass the full trait matrix for multiple plots and multiple traits, and the script will step through one plot at a time, extracting the relevant trait data, conducting each of the tests, and returning a matrix with all of the results. Remember that NNdist is calculated for one trait at a time, while NN_ED is on the full multivariate data set - how are you going to structure the results to handle this??