Bonus

Data mangling

This script shows the steps I took to prepare the mangled dataframes used in the tidy data examples in this workshop.

# Load libraries
library(tidyverse)
library(lubridate)

# Load data
sst_NOAA <- read_csv("course_material/data/sst_NOAA.csv")

Mangle

And now begins the mangling.

# Sites to extract
sites <- c("Med", "NW_Atl", "WA")

# Create tidy base
OISST_tidy <- sst_NOAA %>%
  mutate(year = year(t)) %>%
  filter(site %in% sites,
         year %in% c(2008, 2009)) %>%
  select(-year)

# First mangle
  # Normal tidy data
OISST1 <- OISST_tidy

# Second mangle
OISST2 <- OISST_tidy %>%
  pivot_wider(names_from = site, values_from = temp)

# Third mangle
OISST3 <- OISST_tidy %>%
  mutate(t = as.character(t),
         idx = 1:n()) %>% 
  pivot_longer(cols = c(site, t), names_to = "type", values_to = "name") %>% 
  dplyr::select(idx, type, name, temp)

## Fourth two part mangle
# A
OISST4a <- OISST_tidy %>%
  mutate(t = as.character(t)) %>%
  unite(index, site, t, sep = " ")

# B
OISST4b <- OISST_tidy %>%
  mutate(t = as.character(t),
         idx = 1:n()) %>%
  separate(col = t, into = c("year", "month", "day"), sep = "-") %>%
  select(-temp)

Save

Here we save all five of the newly mangled dataframes as one .RData object for ease of loading in the tutorial.

save(list = c("OISST1", "OISST2", "OISST3", "OISST4a", "OISST4b"), file = "course_material/data/OISST_mangled.RData")

Animations

In this supplemental tutorial we are going to look at how to create animations in R. To do this will require the installation of software outside of R. This software is ImageMagick and may be downloaded here: https://www.imagemagick.org/script/download.php. Once this software has been installed on your computer it will be necessary to install the animation library.

# The libraries required for this tut
library(tidyverse)
library(grid)
library(gridExtra)
library(animation)

Functions for creating ant walks

# Calculate speed based on u and v vectors
ant.speed <- function(df){
  df$x2 <- c(NA,df$x[2:nrow(df)] - df$x[1:(nrow(df)-1)])
  df$y2 <- c(NA,df$y[2:nrow(df)] - df$y[1:(nrow(df)-1)])
  speed_abs <- round(sqrt(df$x2^2 + df$y2^2),2)
  speed_abs[is.na(speed_abs)] <- 0
  return(speed_abs)
}

# Create a dataframe with desired number of ants and steps
ant.walk <- function(i,n){
  # Create the random walks
  walk_x <- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
  for(i in 2:i){
  x <- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
  walk_x <- c(walk_x, x)
  }
  walk_y <- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
  for(i in 2:i){
  y <- c(0,round(cumsum(rnorm(n = n-1, mean = 0, sd = 1)),2))
  walk_y <- c(walk_y, y)
  }
  # Create the walking dataframe
  walker <- data.frame(x = walk_x, y = walk_y, 
                       ant = as.factor(rep(1:i, each = n)), 
                       step =  rep(seq(1,n), i))
  walker$speed <- ant.speed(walker)
  walker$speed[walker$step == 1] <- 0
  return(walker)
}

Generate the ants

# Increase the second number for a longer animation
ants <- ant.walk(5, 20)

The function to animate the walk plot

walk.plot <- function(i){
  # Map figure
  walk_map <- ggplot(data = ants[ants$step %in% 1:i,], aes(x = x, y = y)) +
    geom_path(aes( group = ant), colour = "gray60") +
    geom_point(data = ants[ants$step == i,], aes(colour = ant))
  # Speed histogram
  walk_hist <- ggplot(data = ants[ants$step %in% 1:i,], aes(x = speed)) +
    geom_histogram() +
    labs(x = "speed")
  # Speed line graph
  walk_line <- ggplot(data = ants[ants$step %in% 1:i,], aes(x = step, y = speed)) +
    geom_line(aes(colour = ant))
  # Wack it together
  grid.arrange(walk_map, walk_hist, walk_line, layout_matrix = cbind(c(1,1), c(1,1), c(2,3)))
}

## Create animation of ts plots
animate.walk.plot <- function() {
  lapply(seq(1, max(ants$step)), function(i) {
    walk.plot(i)
  })
}

Render the GIF

# By default 'saveGIF()' outputs to the same folder as from where the script is.
# I have included the lines here to change your working directory and then change it back.
# Be careful here! This can be easy to cause issues.
setwd("course_material/")
system.time(saveGIF(animate.walk.plot(), interval = 0.2, 
                    ani.width = 800, movie.name = "ant_walk.gif")) ## ~60 seconds
setwd("../")

GIFs on plots

Yes my friends, it is true. We may add GIFs to our figures and maps. Rejoice. Better yet, the process is relatively straight forward. We will begin, as usual, by loading our libraries and files.

# Load libraries
library(tidyverse)
library(magick)

# The base image
background <- image_read("images/EU_flag.jpg") %>% # Load file
  image_scale("900") # Change resolution

# The gif to overlay
anim_overlay <- image_read("course_material/ant_walk.gif")  %>% # Load file 
  image_scale("300") # Change resolution

GIF creation

Once we have loaded our base image and the GIF we want to put on top of it we need to create a function to make these two different file types ‘kiss’. With the appropriately named magick package this is startlingly easy to do.

frames <- lapply(anim_overlay, function(frame) {
  image_composite(background, frame, offset = "+300")
})

GIF animation

With our function for creating the GIF sorted, it is now time to animate it!

animation <- image_animate(image_join(frames), fps = 10) # FPS = 10 is native speed

GIF save

Jip. Simple as that.

image_write(animation, "course_material/EU_ants.gif")

Morphing

Have you ever wanted to animate the transition from one figure to another? No? Me neither. But hey, it’s easy to do, so why not.

# Load libraries
library(magick)

# Load images
newlogo <- image_scale(image_read("https://www.r-project.org/logo/Rlogo.png"), "x150")
oldlogo <- image_scale(image_read("https://developer.r-project.org/Logo/Rlogo-3.png"), "x150")

Morph creation

morph_frames <- image_morph(c(oldlogo, newlogo), frames = 100)

Morph animation

morph_animate <- image_animate(morph_frames, fps = 20)

Morph save

image_write(morph_animate, "course_material/morph.gif")

Multivariate stats

To err is human, but to really foul things up you need a computer.

—Paul R. Ehrlich

In this brief tutorial we are going to walk through the steps necessary to perform a most basic ordination. We will be using MDS for this as it produces, in my opinion, the most straight forward results. There is of course an entire school of thought on this and I, a mere climate scientist, am in no way an authoritative voice on the matter.

# Load libraries
library(tidyverse)
library(ggpubr)
library(vegan)

# Load built-in data
data("dune")
data("dune.env")

MDS

MDS, or multi-dimensional scaling, is high level clustering technique. MDS allows us to determine which of the abiotic variables in our dataset are having the most pronounced effects on the clustering of the dunes. Running an MDS on a data frame in R is simple as the vegan package will do all of the heavy lifting for us. First we will jump straight in and run an MDS, then we will take a step back and try changing the standardisation of the values and the distance matrix that we would normally need to first calculate. Please consult the help file (?metaMDS) for details on the function.

dune_MDS_1 <- metaMDS(dune)

Or we may be more specific in the way in which we prepare our data for the MDS. Look through the help files to see what other options exist.

# Standardise data
dune_stand <- decostand(dune, method = "total")

# Create Bray-Curtis dissimilarity matrix
dune_dist <- vegdist(dune_stand, method = "bray")

# Create distance matrix
dune_MDS_2 <- metaMDS(dune_dist)

Stress

No, not that stress. We are talking about the stress of the MDS model now. This is an important value to check. If the stress is high (>0.3) the MDS model is doing a poor job of modeling the dissimilarities in the data. If it is low (<0.1) the model is doing a very good job of displaying the relationships within the data. To check the stress of our results we use the following line of code.

# Default MDS settings
dune_MDS_1$stress

# Determined settings
dune_MDS_2$stress

What is the stress of this model? Is that an acceptable level?

Basic biplot

With the MDS calculated, and the stress tested, it’s time to visualise the first round of results.

# Convert for ggplot
dune_MDS_points <- data.frame(site = 1:nrow(dune)) %>%
  mutate(x = as.numeric(dune_MDS_2$points[ ,1]),
         y = as.numeric(dune_MDS_2$points[ ,2]))

# Visualise with ggplot
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
  geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
  geom_text(aes(label = site), colour = "white") +
  labs(x = "NMDS1", y = "NMDS2")

Fitting environmental variables

As with all of the other ordination analyses we have performed in R thus far, fitting environmental variables may also be done with one easy step. We do this by providing the envfit() function with a formula, the same as we do for linear models. The dependent variable (to the left of the ~) will be the results of the MDS on the species assemblage data, and the independent variables (to the right of the ~) are the columns from our environmental variables data frame.

dune_envfit <- envfit(dune_MDS_2 ~ Moisture + Use, data = dune.env)
dune_envfit

In the printout above we see the results for the R^2 (here r2) and p-values for the fit of each abiotic variable to the species assemblage data. Which relationships are significant? Which variable(s) appears to best explain the variance in the species assemblages? Which of the axes of the MDS have the strongest relationship with which variable?

To plot the results of our fitted abiotic variables on top of our species MDS we need to quickly prep it to play nice with ggplot2 and then we need only append a couple of lines onto the chunk we wrote to display our MDS results.

# Extract the envfit vector values
dune_envfit_df <- data.frame(dune_envfit$factors$centroids) %>%
  mutate(factors = row.names(.)) %>%
  rename(x = NMDS1, y = NMDS2)

# Visualise environmental fits
ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
  geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
  geom_text(aes(label = site), colour = "white") +
  geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
               aes(x = 0, y = 0, xend = x, yend = y)) +
  geom_text(data = dune_envfit_df, colour = "red", 
            aes(x = x, y = y, label = factors)) +
  labs(x = "NMDS1", y = "NMDS2")

Adding clusters

In order to add clustering we must first create groupings for our data. In this instance we will be calculating our groups using hierarchical cluster analysis.

# Create dendrogram
  # Note that this must be run on a distance matrix
dune_clust <- hclust(dune_dist, "ward.D")

# Extract clusters
  # In this case we have decided on four clusters
dune_grp <- cutree(dune_clust, 4)

# Extract groups for plotting
dune_MDS_points <- dune_MDS_points %>% 
  mutate(grp_id = as.factor(dune_grp))

With the clusters calculated we may now plot ellipses on our biplot. We will first do this with the built-in functionality of ggplot2, which unfortunately isn’t great.

ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
  geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
  geom_text(aes(label = site), colour = "white") +
  geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
               aes(x = 0, y = 0, xend = x, yend = y)) +
  geom_text(data = dune_envfit_df, colour = "red", 
            aes(x = x, y = y, label = factors)) +
  # The ellipses
  stat_ellipse(aes(colour = grp_id), type = "t") + 
  #
  labs(x = "NMDS1", y = "NMDS2", colour = "Cluster")

If we have very large datasets the ellipses will come more in line with what we want. With small datasets not so much. This is because the ellipses are actually calculating the area under which a certain confidence interval is maintained that the points in that group may be found. If we would rather use polygons to fit directly onto the area of our clusters we do so by replacing the ellipses with the following line of code.

ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
  geom_point(size = 8, shape = 21, fill = "black", colour = "red") +
  geom_text(aes(label = site), colour = "white") +
  geom_segment(data = dune_envfit_df, arrow = arrow(length = unit(0.25, "cm")),
               aes(x = 0, y = 0, xend = x, yend = y)) +
  geom_text(data = dune_envfit_df, colour = "red", 
            aes(x = x, y = y, label = factors)) +
  # The custom made polygons
  stat_chull(geom = "polygon", aes(fill = grp_id), alpha = 0.4) +
  #
  labs(x = "NMDS1", y = "NMDS2")

I’m not super excited about that result either. A third option is to simply change the colour of the points to reflect their grouping.

ggplot(data = dune_MDS_points, aes(x = x, y = y)) +
  # Changing point aesthetics
  geom_point(size = 8, aes(colour = grp_id)) +
  #
  geom_text(aes(label = site), colour = "white") +
  geom_segment(data = dune_envfit_df, 
               aes(x = 0, y = 0, xend = x, yend = y)) +
  geom_text(data = dune_envfit_df, colour = "red",
            aes(label = factors)) +
  labs(x = "NMDS1", y = "NMDS2", colour = "Cluster")

I think this is actually the cleanest way to visualise the data.

Diversity

If we are interested in calculating a Shannon-Wiener index on the species diversity found within the dunes we need only one function.

diversity(dune)

ANOSIM

One final thing. It is also necessary to know if any differences exist between the clusters we have determined for our data. To do this we use the anosim() function from the vegan package.

anosim(dune_dist, dune_grp)