Bonus
Data mangling
This script shows the steps I took to prepare the mangled dataframes used in the tidy data examples in this workshop.
# Load libraries
library(tidyverse)
library(lubridate)
# Load data
sst_NOAA <- read_csv("course_material/data/sst_NOAA.csv")Mangle
And now begins the mangling.
# Sites to extract
sites <- c("Med", "NW_Atl", "WA")
# Create tidy base
OISST_tidy <- sst_NOAA %>%
mutate(year = year(t)) %>%
filter(site %in% sites,
year %in% c(2008, 2009)) %>%
select(-year)
# First mangle
# Normal tidy data
OISST1 <- OISST_tidy
# Second mangle
OISST2 <- OISST_tidy %>%
pivot_wider(names_from = site, values_from = temp)
# Third mangle
OISST3 <- OISST_tidy %>%
mutate(t = as.character(t),
idx = 1:n()) %>%
pivot_longer(cols = c(site, t), names_to = "type", values_to = "name") %>%
dplyr::select(idx, type, name, temp)
## Fourth two part mangle
# A
OISST4a <- OISST_tidy %>%
mutate(t = as.character(t)) %>%
unite(index, site, t, sep = " ")
# B
OISST4b <- OISST_tidy %>%
mutate(t = as.character(t),
idx = 1:n()) %>%
separate(col = t, into = c("year", "month", "day"), sep = "-") %>%
select(-temp)Save
Here we save all five of the newly mangled dataframes as one .RData object for ease of loading in the tutorial.
save(list = c("OISST1", "OISST2", "OISST3", "OISST4a", "OISST4b"), file = "course_material/data/OISST_mangled.RData")Animations
In this supplemental tutorial we are going to look at how to create animations in R. To do this will require the installation of software outside of R. This software is ImageMagick and may be downloaded here: https://www.imagemagick.org/script/download.php. Once this software has been installed on your computer it will be necessary to install the animation library.
# The libraries required for this tut
library(tidyverse)
library(grid)
library(gridExtra)
library(animation)Functions for creating ant walks
# Calculate speed based on u and v vectors
ant.speed <- function(df){
df$x2 <- c(NA,df$x[2:nrow(df)] - df$x[1:(nrow(df)-1)])
df$y2 <- c(NA,df$y[2:nrow(df)] - df$y[1:(nrow(df)-1)])
speed_abs <- round(sqrt(df$x2^2 + df$y2^2),2)
speed_abs[is.na(speed_abs)] <- 0
return(speed_abs)
}
# Create a dataframe with desired number of ants and steps
ant.walk <- function(i,n){
# Create the random walks
walk_x <- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
for(i in 2:i){
x <- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
walk_x <- c(walk_x, x)
}
walk_y <- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
for(i in 2:i){
y <- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
walk_y <- c(walk_y, y)
}
# Create the walking dataframe
walker <- data.frame(x = walk_x, y = walk_y,
ant = as.factor(rep(1:i, each = n)),
step = rep(seq(1,n), i))
walker$speed <- ant.speed(walker)
walker$speed[walker$step == 1] <- 0
return(walker)
}Generate the ants
# Increase the second number for a longer animation
ants <- ant.walk(5, 20)The function to animate the walk plot
walk.plot <- function(i){
# Map figure
walk_map <- ggplot(data = ants[ants$step %in% 1:i,], aes(x = x, y = y)) +
geom_path(aes( group = ant), colour = "gray60") +
geom_point(data = ants[ants$step == i,], aes(colour = ant))
# Speed histogram
walk_hist <- ggplot(data = ants[ants$step %in% 1:i,], aes(x = speed)) +
geom_histogram() +
labs(x = "speed")
# Speed line graph
walk_line <- ggplot(data = ants[ants$step %in% 1:i,], aes(x = step, y = speed)) +
geom_line(aes(colour = ant))
# Wack it together
grid.arrange(walk_map, walk_hist, walk_line, layout_matrix = cbind(c(1,1), c(1,1), c(2,3)))
}
## Create animation of ts plots
animate.walk.plot <- function() {
lapply(seq(1, max(ants$step)), function(i) {
walk.plot(i)
})
}Render the GIF
# By default 'saveGIF()' outputs to the same folder as from where the script is.
# I have included the lines here to change your working directory and then change it back.
# Be careful here! This can be easy to cause issues.
setwd("course_material/")
system.time(saveGIF(animate.walk.plot(), interval = 0.2,
ani.width = 800, movie.name = "ant_walk.gif")) ## ~60 seconds
setwd("../")GIFs on plots
Yes my friends, it is true. We may add GIFs to our figures and maps. Rejoice. Better yet, the process is relatively straight forward. We will begin, as usual, by loading our libraries and files.
# Load libraries
library(tidyverse)
library(magick)
# The base image
background <- image_read("images/EU_flag.jpg") %>% # Load file
image_scale("900") # Change resolution
# The gif to overlay
anim_overlay <- image_read("course_material/ant_walk.gif") %>% # Load file
image_scale("300") # Change resolutionGIF creation
Once we have loaded our base image and the GIF we want to put on top of it we need to create a function to make these two different file types ‘kiss’. With the appropriately named magick package this is startlingly easy to do.
frames <- lapply(anim_overlay, function(frame) {
image_composite(background, frame, offset = "+300")
})GIF animation
With our function for creating the GIF sorted, it is now time to animate it!
animation <- image_animate(image_join(frames), fps = 10) # FPS = 10 is native speedGIF save
Jip. Simple as that.
image_write(animation, "course_material/EU_ants.gif")Morphing
Have you ever wanted to animate the transition from one figure to another? No? Me neither. But hey, it’s easy to do, so why not.
# Load libraries
library(magick)
# Load images
newlogo <- image_scale(image_read("https://www.r-project.org/logo/Rlogo.png"), "x150")
oldlogo <- image_scale(image_read("https://developer.r-project.org/Logo/Rlogo-3.png"), "x150")Morph creation
morph_frames <- image_morph(c(oldlogo, newlogo), frames = 100)Morph animation
morph_animate <- image_animate(morph_frames, fps = 20)Morph save
image_write(morph_animate, "course_material/morph.gif")Multivariate stats
To err is human, but to really foul things up you need a computer.
—Paul R. Ehrlich
In this brief tutorial we are going to walk through the steps necessary to perform a most basic ordination. We will be using MDS for this as it produces, in my opinion, the most straight forward results. There is of course an entire school of thought on this and I, a mere climate scientist, am in no way an authoritative voice on the matter.
# Load libraries
library(tidyverse)
library(ggpubr)
library(vegan)
# Load built-in data
data("dune")
data("dune.env")MDS
MDS, or multi-dimensional scaling, is high level clustering technique. MDS allows us to determine which of the abiotic variables in our dataset are having the most pronounced effects on the clustering of the dunes. Running an MDS on a data frame in R is simple as the vegan package will do all of the heavy lifting for us. First we will jump straight in and run an MDS, then we will take a step back and try changing the standardisation of the values and the distance matrix that we would normally need to first calculate. Please consult the help file (?metaMDS) for details on the function.
dune_MDS_1 <- metaMDS(dune)Or we may be more specific in the way in which we prepare our data for the MDS. Look through the help files to see what other options exist.
# Standardise data
dune_stand <- decostand(dune, method = "total")
# Create Bray-Curtis dissimilarity matrix
dune_dist <- vegdist(dune_stand, method = "bray")
# Create distance matrix
dune_MDS_2 <- metaMDS(dune_dist)Stress
No, not that stress. We are talking about the stress of the MDS model now. This is an important value to check. If the stress is high (>0.3) the MDS model is doing a poor job of modeling the dissimilarities in the data. If it is low (<0.1) the model is doing a very good job of displaying the relationships within the data. To check the stress of our results we use the following line of code.
# Default MDS settings
dune_MDS_1$stress
# Determined settings
dune_MDS_2$stressWhat is the stress of this model? Is that an acceptable level?
Basic biplot
With the MDS calculated, and the stress tested, it’s time to visualise the first round of results.
# Convert for ggplot
dune_MDS_points <- data.frame(site = 1:nrow(dune)) %>%
mutate(x = as.numeric(dune_MDS_2$points[ ,1]),
y = as.numeric(dune_MDS_2$points[ ,2]))
# Visualise with ggplot
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
geom_text(aes(label = site), colour = "white") +
labs(x = "NMDS1", y = "NMDS2")Fitting environmental variables
As with all of the other ordination analyses we have performed in R thus far, fitting environmental variables may also be done with one easy step. We do this by providing the envfit() function with a formula, the same as we do for linear models. The dependent variable (to the left of the ~) will be the results of the MDS on the species assemblage data, and the independent variables (to the right of the ~) are the columns from our environmental variables data frame.
dune_envfit <- envfit(dune_MDS_2 ~ Moisture + Use, data = dune.env)
dune_envfitIn the printout above we see the results for the R^2 (here r2) and p-values for the fit of each abiotic variable to the species assemblage data. Which relationships are significant? Which variable(s) appears to best explain the variance in the species assemblages? Which of the axes of the MDS have the strongest relationship with which variable?
To plot the results of our fitted abiotic variables on top of our species MDS we need to quickly prep it to play nice with ggplot2 and then we need only append a couple of lines onto the chunk we wrote to display our MDS results.
# Extract the envfit vector values
dune_envfit_df <- data.frame(dune_envfit$factors$centroids) %>%
mutate(factors = row.names(.)) %>%
rename(x = NMDS1, y = NMDS2)
# Visualise environmental fits
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
geom_text(aes(label = site), colour = "white") +
geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
aes(x = 0, y = 0, xend = x, yend = y)) +
geom_text(data = dune_envfit_df, colour = "red",
aes(x = x, y = y, label = factors)) +
labs(x = "NMDS1", y = "NMDS2")Adding clusters
In order to add clustering we must first create groupings for our data. In this instance we will be calculating our groups using hierarchical cluster analysis.
# Create dendrogram
# Note that this must be run on a distance matrix
dune_clust <- hclust(dune_dist, "ward.D")
# Extract clusters
# In this case we have decided on four clusters
dune_grp <- cutree(dune_clust, 4)
# Extract groups for plotting
dune_MDS_points <- dune_MDS_points %>%
mutate(grp_id = as.factor(dune_grp))With the clusters calculated we may now plot ellipses on our biplot. We will first do this with the built-in functionality of ggplot2, which unfortunately isn’t great.
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
geom_text(aes(label = site), colour = "white") +
geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
aes(x = 0, y = 0, xend = x, yend = y)) +
geom_text(data = dune_envfit_df, colour = "red",
aes(x = x, y = y, label = factors)) +
# The ellipses
stat_ellipse(aes(colour = grp_id), type = "t") +
#
labs(x = "NMDS1", y = "NMDS2", colour = "Cluster")If we have very large datasets the ellipses will come more in line with what we want. With small datasets not so much. This is because the ellipses are actually calculating the area under which a certain confidence interval is maintained that the points in that group may be found. If we would rather use polygons to fit directly onto the area of our clusters we do so by replacing the ellipses with the following line of code.
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
geom_text(aes(label = site), colour = "white") +
geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
aes(x = 0, y = 0, xend = x, yend = y)) +
geom_text(data = dune_envfit_df, colour = "red",
aes(x = x, y = y, label = factors)) +
# The custom made polygons
stat_chull(geom = "polygon", aes(fill = grp_id), alpha = 0.4) +
#
labs(x = "NMDS1", y = "NMDS2")I’m not super excited about that result either. A third option is to simply change the colour of the points to reflect their grouping.
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
# Changing point aesthetics
geom_point(size = 8, aes(colour = grp_id)) +
#
geom_text(aes(label = site), colour = "white") +
geom_segment(data = dune_envfit_df,
aes(x = 0, y = 0, xend = x, yend = y)) +
geom_text(data = dune_envfit_df, colour = "red",
aes(label = factors)) +
labs(x = "NMDS1", y = "NMDS2", colour = "Cluster")I think this is actually the cleanest way to visualise the data.
Diversity
If we are interested in calculating a Shannon-Wiener index on the species diversity found within the dunes we need only one function.
diversity(dune)ANOSIM
One final thing. It is also necessary to know if any differences exist between the clusters we have determined for our data. To do this we use the anosim() function from the vegan package.
anosim(dune_dist, dune_grp)