Bonus
Data mangling
This script shows the steps I took to prepare the mangled dataframes used in the tidy data examples in this workshop.
# Load libraries
library(tidyverse)
library(lubridate)
# Load data
<- read_csv("course_material/data/sst_NOAA.csv") sst_NOAA
Mangle
And now begins the mangling.
# Sites to extract
<- c("Med", "NW_Atl", "WA")
sites
# Create tidy base
<- sst_NOAA %>%
OISST_tidy mutate(year = year(t)) %>%
filter(site %in% sites,
%in% c(2008, 2009)) %>%
year select(-year)
# First mangle
# Normal tidy data
<- OISST_tidy
OISST1
# Second mangle
<- OISST_tidy %>%
OISST2 pivot_wider(names_from = site, values_from = temp)
# Third mangle
<- OISST_tidy %>%
OISST3 mutate(t = as.character(t),
idx = 1:n()) %>%
pivot_longer(cols = c(site, t), names_to = "type", values_to = "name") %>%
::select(idx, type, name, temp)
dplyr
## Fourth two part mangle
# A
<- OISST_tidy %>%
OISST4a mutate(t = as.character(t)) %>%
unite(index, site, t, sep = " ")
# B
<- OISST_tidy %>%
OISST4b mutate(t = as.character(t),
idx = 1:n()) %>%
separate(col = t, into = c("year", "month", "day"), sep = "-") %>%
select(-temp)
Save
Here we save all five of the newly mangled dataframes as one .RData object for ease of loading in the tutorial.
save(list = c("OISST1", "OISST2", "OISST3", "OISST4a", "OISST4b"), file = "course_material/data/OISST_mangled.RData")
Animations
In this supplemental tutorial we are going to look at how to create animations in R. To do this will require the installation of software outside of R. This software is ImageMagick
and may be downloaded here: https://www.imagemagick.org/script/download.php. Once this software has been installed on your computer it will be necessary to install the animation
library.
# The libraries required for this tut
library(tidyverse)
library(grid)
library(gridExtra)
library(animation)
Functions for creating ant walks
# Calculate speed based on u and v vectors
<- function(df){
ant.speed $x2 <- c(NA,df$x[2:nrow(df)] - df$x[1:(nrow(df)-1)])
df$y2 <- c(NA,df$y[2:nrow(df)] - df$y[1:(nrow(df)-1)])
df<- round(sqrt(df$x2^2 + df$y2^2),2)
speed_abs is.na(speed_abs)] <- 0
speed_abs[return(speed_abs)
}
# Create a dataframe with desired number of ants and steps
<- function(i,n){
ant.walk # Create the random walks
<- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
walk_x for(i in 2:i){
<- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
x <- c(walk_x, x)
walk_x
}<- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
walk_y for(i in 2:i){
<- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
y <- c(walk_y, y)
walk_y
}# Create the walking dataframe
<- data.frame(x = walk_x, y = walk_y,
walker ant = as.factor(rep(1:i, each = n)),
step = rep(seq(1,n), i))
$speed <- ant.speed(walker)
walker$speed[walker$step == 1] <- 0
walkerreturn(walker)
}
Generate the ants
# Increase the second number for a longer animation
<- ant.walk(5, 20) ants
The function to animate the walk plot
<- function(i){
walk.plot # Map figure
<- ggplot(data = ants[ants$step %in% 1:i,], aes(x = x, y = y)) +
walk_map geom_path(aes( group = ant), colour = "gray60") +
geom_point(data = ants[ants$step == i,], aes(colour = ant))
# Speed histogram
<- ggplot(data = ants[ants$step %in% 1:i,], aes(x = speed)) +
walk_hist geom_histogram() +
labs(x = "speed")
# Speed line graph
<- ggplot(data = ants[ants$step %in% 1:i,], aes(x = step, y = speed)) +
walk_line geom_line(aes(colour = ant))
# Wack it together
grid.arrange(walk_map, walk_hist, walk_line, layout_matrix = cbind(c(1,1), c(1,1), c(2,3)))
}
## Create animation of ts plots
<- function() {
animate.walk.plot lapply(seq(1, max(ants$step)), function(i) {
walk.plot(i)
}) }
Render the GIF
# By default 'saveGIF()' outputs to the same folder as from where the script is.
# I have included the lines here to change your working directory and then change it back.
# Be careful here! This can be easy to cause issues.
setwd("course_material/")
system.time(saveGIF(animate.walk.plot(), interval = 0.2,
ani.width = 800, movie.name = "ant_walk.gif")) ## ~60 seconds
setwd("../")
GIFs on plots
Yes my friends, it is true. We may add GIFs to our figures and maps. Rejoice. Better yet, the process is relatively straight forward. We will begin, as usual, by loading our libraries and files.
# Load libraries
library(tidyverse)
library(magick)
# The base image
<- image_read("images/EU_flag.jpg") %>% # Load file
background image_scale("900") # Change resolution
# The gif to overlay
<- image_read("course_material/ant_walk.gif") %>% # Load file
anim_overlay image_scale("300") # Change resolution
GIF creation
Once we have loaded our base image and the GIF we want to put on top of it we need to create a function to make these two different file types ‘kiss’. With the appropriately named magick
package this is startlingly easy to do.
<- lapply(anim_overlay, function(frame) {
frames image_composite(background, frame, offset = "+300")
})
GIF animation
With our function for creating the GIF sorted, it is now time to animate it!
<- image_animate(image_join(frames), fps = 10) # FPS = 10 is native speed animation
GIF save
Jip. Simple as that.
image_write(animation, "course_material/EU_ants.gif")
Morphing
Have you ever wanted to animate the transition from one figure to another? No? Me neither. But hey, it’s easy to do, so why not.
# Load libraries
library(magick)
# Load images
<- image_scale(image_read("https://www.r-project.org/logo/Rlogo.png"), "x150")
newlogo <- image_scale(image_read("https://developer.r-project.org/Logo/Rlogo-3.png"), "x150") oldlogo
Morph creation
<- image_morph(c(oldlogo, newlogo), frames = 100) morph_frames
Morph animation
<- image_animate(morph_frames, fps = 20) morph_animate
Morph save
image_write(morph_animate, "course_material/morph.gif")
Multivariate stats
To err is human, but to really foul things up you need a computer.
—Paul R. Ehrlich
In this brief tutorial we are going to walk through the steps necessary to perform a most basic ordination. We will be using MDS for this as it produces, in my opinion, the most straight forward results. There is of course an entire school of thought on this and I, a mere climate scientist, am in no way an authoritative voice on the matter.
# Load libraries
library(tidyverse)
library(ggpubr)
library(vegan)
# Load built-in data
data("dune")
data("dune.env")
MDS
MDS, or multi-dimensional scaling, is high level clustering technique. MDS allows us to determine which of the abiotic variables in our dataset are having the most pronounced effects on the clustering of the dunes. Running an MDS on a data frame in R is simple as the vegan
package will do all of the heavy lifting for us. First we will jump straight in and run an MDS, then we will take a step back and try changing the standardisation of the values and the distance matrix that we would normally need to first calculate. Please consult the help file (?metaMDS
) for details on the function.
<- metaMDS(dune) dune_MDS_1
Or we may be more specific in the way in which we prepare our data for the MDS. Look through the help files to see what other options exist.
# Standardise data
<- decostand(dune, method = "total")
dune_stand
# Create Bray-Curtis dissimilarity matrix
<- vegdist(dune_stand, method = "bray")
dune_dist
# Create distance matrix
<- metaMDS(dune_dist) dune_MDS_2
Stress
No, not that stress. We are talking about the stress of the MDS model now. This is an important value to check. If the stress is high (>0.3) the MDS model is doing a poor job of modeling the dissimilarities in the data. If it is low (<0.1) the model is doing a very good job of displaying the relationships within the data. To check the stress of our results we use the following line of code.
# Default MDS settings
$stress
dune_MDS_1
# Determined settings
$stress dune_MDS_2
What is the stress of this model? Is that an acceptable level?
Basic biplot
With the MDS calculated, and the stress tested, it’s time to visualise the first round of results.
# Convert for ggplot
<- data.frame(site = 1:nrow(dune)) %>%
dune_MDS_points mutate(x = as.numeric(dune_MDS_2$points[ ,1]),
y = as.numeric(dune_MDS_2$points[ ,2]))
# Visualise with ggplot
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
geom_text(aes(label = site), colour = "white") +
labs(x = "NMDS1", y = "NMDS2")
Fitting environmental variables
As with all of the other ordination analyses we have performed in R thus far, fitting environmental variables may also be done with one easy step. We do this by providing the envfit()
function with a formula, the same as we do for linear models. The dependent variable (to the left of the ~
) will be the results of the MDS on the species assemblage data, and the independent variables (to the right of the ~
) are the columns from our environmental variables data frame.
<- envfit(dune_MDS_2 ~ Moisture + Use, data = dune.env)
dune_envfit dune_envfit
In the printout above we see the results for the R^2 (here r2) and p-values for the fit of each abiotic variable to the species assemblage data. Which relationships are significant? Which variable(s) appears to best explain the variance in the species assemblages? Which of the axes of the MDS have the strongest relationship with which variable?
To plot the results of our fitted abiotic variables on top of our species MDS we need to quickly prep it to play nice with ggplot2
and then we need only append a couple of lines onto the chunk we wrote to display our MDS results.
# Extract the envfit vector values
<- data.frame(dune_envfit$factors$centroids) %>%
dune_envfit_df mutate(factors = row.names(.)) %>%
rename(x = NMDS1, y = NMDS2)
# Visualise environmental fits
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
geom_text(aes(label = site), colour = "white") +
geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
aes(x = 0, y = 0, xend = x, yend = y)) +
geom_text(data = dune_envfit_df, colour = "red",
aes(x = x, y = y, label = factors)) +
labs(x = "NMDS1", y = "NMDS2")
Adding clusters
In order to add clustering we must first create groupings for our data. In this instance we will be calculating our groups using hierarchical cluster analysis.
# Create dendrogram
# Note that this must be run on a distance matrix
<- hclust(dune_dist, "ward.D")
dune_clust
# Extract clusters
# In this case we have decided on four clusters
<- cutree(dune_clust, 4)
dune_grp
# Extract groups for plotting
<- dune_MDS_points %>%
dune_MDS_points mutate(grp_id = as.factor(dune_grp))
With the clusters calculated we may now plot ellipses on our biplot. We will first do this with the built-in functionality of ggplot2
, which unfortunately isn’t great.
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
geom_text(aes(label = site), colour = "white") +
geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
aes(x = 0, y = 0, xend = x, yend = y)) +
geom_text(data = dune_envfit_df, colour = "red",
aes(x = x, y = y, label = factors)) +
# The ellipses
stat_ellipse(aes(colour = grp_id), type = "t") +
#
labs(x = "NMDS1", y = "NMDS2", colour = "Cluster")
If we have very large datasets the ellipses will come more in line with what we want. With small datasets not so much. This is because the ellipses are actually calculating the area under which a certain confidence interval is maintained that the points in that group may be found. If we would rather use polygons to fit directly onto the area of our clusters we do so by replacing the ellipses with the following line of code.
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
geom_text(aes(label = site), colour = "white") +
geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
aes(x = 0, y = 0, xend = x, yend = y)) +
geom_text(data = dune_envfit_df, colour = "red",
aes(x = x, y = y, label = factors)) +
# The custom made polygons
stat_chull(geom = "polygon", aes(fill = grp_id), alpha = 0.4) +
#
labs(x = "NMDS1", y = "NMDS2")
I’m not super excited about that result either. A third option is to simply change the colour of the points to reflect their grouping.
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
# Changing point aesthetics
geom_point(size = 8, aes(colour = grp_id)) +
#
geom_text(aes(label = site), colour = "white") +
geom_segment(data = dune_envfit_df,
aes(x = 0, y = 0, xend = x, yend = y)) +
geom_text(data = dune_envfit_df, colour = "red",
aes(label = factors)) +
labs(x = "NMDS1", y = "NMDS2", colour = "Cluster")
I think this is actually the cleanest way to visualise the data.
Diversity
If we are interested in calculating a Shannon-Wiener index on the species diversity found within the dunes we need only one function.
diversity(dune)
ANOSIM
One final thing. It is also necessary to know if any differences exist between the clusters we have determined for our data. To do this we use the anosim()
function from the vegan
package.
anosim(dune_dist, dune_grp)