---
title: "3_Classification_WEST"
author: "Jade Millot"
date: "2025-01-16"
output: html_document
---

# Global description:

In this script we identify and predict bioregions in the Western Mediterranean basin based on the composition in mega-invertebrates recorded during MEDITS surveys. Stations are classified using k-means clustering, bioregion distributions are predicted through a Random Forest (RF) model based on environmental conditions and indicator species are identified for each bioregion. 

# Clean space

```{r setup, include = FALSE}
knitr::opts_chunk$set(
	eval = T,
	echo = TRUE,
	message = FALSE,
	warning = FALSE,
	cache = TRUE
)
```

# Library loading

```{r packages_analysis, echo = TRUE}
## General
library(tidyverse)

## Correlation 
library(Hmisc)
library(corrplot)
library(gplots)

##Spatials
library(rgdal)
library(ggspatial)
library(sp)
library(sf)
library(raster)

library(here)
library(terra)
library(mapview)
library(tmap)

## Models
library(mgcv)
library(performance)
library(MuMIn)
library(biomod2)
library(blockCV)

# Biodiversity analysis
library(vegan)
library(ade4)
library(factoextra)
library(cluster)
library(betapart)
library(matrixStats)
library(indicspecies)
library(ape)
library(NbClust)

# Plot
library(gratia)
library(ggrepel)
library(RColorBrewer)
library(formattable)
library(htmltools)
library(kableExtra)
library(gt)
library(mapedit)

# Other
library(udunits2)
library(R.utils)
library(scales)
library(gridExtra)
library(purrr)
library(rlist)
library(vegan)
library(caret)
library(caTools)
library(randomForest)
library(fmsb)

# Extrapolation
library(remotes)
library(dsmextra)
```

# Functions

```{r}
# Function to plot a broken stick model for a PCA
# -----------------------------------------------

evplot = function(ev) {
  # Broken stick model (MacArthur 1957)
  n = length(ev)
  bsm = data.frame(j=seq(1:n), p=0)
  bsm$p[1] = 1/n
  for (i in 2:n) bsm$p[i] = bsm$p[i-1] + (1/(n + 1 - i))
  bsm$p = 100*bsm$p/n
  # Plot eigenvalues and % of variation for each axis
  op = par(mfrow=c(2,1),omi=c(0.1,0.3,0.1,0.1), mar=c(1, 1, 1, 1))
  barplot(ev, main="Eigenvalues", col="bisque", las=2)
  abline(h=mean(ev), col="red")
  legend("topright", "Average eigenvalue", lwd=1, col=2, bty="n")
  barplot(t(cbind(100*ev/sum(ev), bsm$p[n:1])), beside=TRUE, 
          main="% variation", col=c("bisque",2), las=2)
  legend("topright", c("% eigenvalue", "Broken stick model"), 
         pch=15, col=c("bisque",2), bty="n")
  par(op)
}

# Function to format a table from IndVal function (10 indicator taxa per bioregion)
# -----------------------------------------------------------------

formatted_indVal <- function(indval_file = indval_ALL$sign, n_opt = 5) {
  
T_indSp_list <- list()

for (n in 1:n_opt){
  
  set_indSp <- indval_file %>% 
  rownames_to_column(var="species") %>%
  rowwise() %>% 
  mutate(sum_index = sum(across(starts_with("s.")), na.rm = T)) %>% as.data.frame() %>% 
  filter(.data[[colnames(.)[n+1]]] == 1 & sum_index == 1) %>% 
  arrange(-stat) %>% ungroup() %>% 
  dplyr::slice(1:10) %>% # only select the 10 first taxa
  dplyr::select(species) %>% 
  mutate(clust = paste0("Bioregion_", n)) %>% 
  rownames_to_column(var="index")

T_indSp_list[[n]] <- set_indSp
}

T_indSp <- do.call("bind_rows", T_indSp_list) %>% 
  pivot_wider(names_from = "clust", values_from = "species") %>% 
  mutate_all(~ifelse(is.na(.), "-", .))

return (T_indSp)

#print(kbl_indSp)

}

# Function to generate Response curves of RF models
# -------------------------------------------------

generate_response_curve <- function(predictor, data, cluster_number, level, palette) {

  # Create a sequence of values for the predictor
  predictor_seq <- seq(min(data[[predictor]]), max(data[[predictor]]), length.out = 100)
  
  # Keep the other predictors to their mean
  predictors_mean <- data %>% dplyr::select(-cluster) %>% summarise_all(mean, na.rm = TRUE) %>%   dplyr::select(-!!predictor)
  
  # Table
  pred_data <- as.data.frame(cbind(predictor_seq, predictors_mean)) %>% dplyr::rename(!!predictor := predictor_seq)
  
  # Use the model to predict probabilities for each class
  predicted_probs <- predict(RF, newdata = pred_data, type = "prob")
  
  # Convert predictions to long format for ggplot
  pred_long <- cbind(pred_data, predicted_probs) %>%
    pivot_longer(cols = tail(names(.), cluster_number), names_to = "cluster", values_to = "Probability") %>% 
    mutate(cluster = factor(cluster, levels = level))
  
  # Plot the response curve
  ggplot(pred_long, aes_string(x = predictor, y = "Probability", color = "cluster")) +
    geom_line(size = 1) +
    labs(x = predictor,
         y = "Predicted Probability") +
    scale_color_manual(values = palette) + 
    theme_minimal()
}


```

# Load data

```{r}

# Path
basin <- "WEST" 
path_raw <- "E:/PhD_2022_2023/Project/Bioregionalisation/R/"
path_basin <- paste0(path_raw, basin)

# Load 
load(paste0(path_raw,"Data/Community_matrix/BENT_", basin,".Rdata")) # Community matrix 
load(paste0(path_raw,"Data/Community_matrix/BENT_met_", basin,".Rdata")) # Metadata of MEDITS stations
load(paste0(path_raw,"Data/Community_matrix/BENT_env_", basin,".Rdata")) # Environmental data of MEDITS stations 
env_stack <- rast(paste0(path_raw,"Data/Environment/env_stack_", basin,".tif")) # Stack of environmental variables
env_stack[['substrate']] <- log(env_stack[['substrate']] + 1) # Log transform the substrate variable

# Names
com_mat_all <- BENT_WEST
com_mat_met <- BENT_met_WEST
com_mat_env <- BENT_env_WEST
com_mat_hel_all <- decostand(com_mat_all, method = "hellinger")

# Coastline Med
sf_use_s2(FALSE)
coastline_med <- st_read("E:/PhD_2022_2023/Project/SDM/R/Data/Country/land-polygons/coastline_med_rap.shp")

# Costaline basin - extent
coastline_basin <- coastline_med %>% 
  st_crop(xmin=-6, xmax=19, ymin=34, ymax=45)

# Mask for hard substrate to remove
rock_filter <- st_read("E:/PhD_2022_2023/Project/SDM/R/Data/Environment/EUNIS/rock_filter.shp")

# Mask of the Mediterranean above 1000m 
study_extent_med <- ext(-6,37,30,46)
below_1000m <- rast(here::here("Data/Environment/Terrain/depth.tif")) %>% 
  terra :: crop(study_extent_med) %>% 
  clamp(upper=-1000, values = FALSE)

```

# Correlations of environmental variables

```{r}

# Load environmental space of the Mediterranean background
load(paste0(path_raw,"Data/Community_matrix/BENT_extract.Rdata"))

# Select environmental variables
com_mat_med_env <- BENT_extract %>% 
  dplyr::select(-area, -year, -x, -y, -haul_id) %>% 
  dplyr::select(-contains("ltmin")) %>% 
  dplyr::select(-contains("ltmax")) %>% 
  dplyr::select(-contains("swd")) %>% 
  dplyr::select(-contains("range"))

# Generate the correlation matrix
cor_mat <- cor(com_mat_med_env, use = "pairwise.complete.obs", method = "spearman")
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
png(height= 1200, width=1400, file= paste0(path_raw, "Output/Clust_pred/cor_mat_met.png"), type = "cairo")
corr_matrix_env_abs <- abs(cor_mat)
corr_BO <- corrplot(cor_mat, method = "color", col = col(200),  
         type = "upper", addCoef.col = "black", 
         tl.col = "red", tl.srt = 45,
         p.mat = corr_matrix_env_abs, sig.level = 0.70,
         diag = FALSE,
         number.cex = 1, tl.cex = 1, cl.cex = 1, pch.cex = 4)
dev.off()

# First selection of environmental variables (1 keep for each pair with Spearman coefficient > 0.7)
env_selection <- c("depth", "ph_mean", "si_mean", "current_speed_mean", "temp_mean", "salinity_mean", "substrate")

# Generate a second correlation matrix with the selected variables to check 
cor_mat <- cor(subset(com_mat_med_env, select = env_selection), use = "pairwise.complete.obs", method = "spearman")
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))

png(height= 1200, width=1400, file= paste0(path_raw, "Output/Clust_pred/cor_mat_met_final.png"), type = "cairo")
corr_matrix_env_abs <- abs(cor_mat)
corr_BO <- corrplot(cor_mat, method = "color", col = col(200),  
         type = "upper", addCoef.col = "black", 
         tl.col = "red", tl.srt = 45,
         p.mat = corr_matrix_env_abs, sig.level = 0.70,
         diag = FALSE,
         number.cex = 0.8, tl.cex = 1, cl.cex = 1, pch.cex = 4)
dev.off()

```

# PCA: to summarize the community matrix

```{r}

# Distance matrix
com_mat_hel_all <- decostand(com_mat_all, method = "hellinger")

# PCA
pca_all <- dudi.pca(com_mat_hel_all, scannf = FALSE, nf = 40)

# Generate eigenvalues
evplot(pca_all[["eig"]]) # broken stick model
pca_all[["eig"]]

# Only stock axis wit eig > 2 
n_opt_all <- 21 
pca_scores_all <- pca_all$li[1:n_opt_all]

# Vizualisation
fviz_pca_ind(pca_all, label = "none", axes=c(1,2))

```

# K-means: classification of stations with similar biotic composition

```{r}
# Nb clust check
com_mat_all_matrix <- as.matrix(pca_scores_all)
fun <- NbClust(com_mat_all_matrix, distance = "euclidean", min.nc = 3, max.nc = 10, method = "kmeans")

# Optimal number of clusters : check results of fun
n_opt <- 5 

# K-means : applied on PCA scores and using the optimal number of clusters
kmeans_all <- kmeans(pca_scores_all, centers = n_opt, nstart = 100)

# Quick map to vizualise the groups of stations
colors_vector <- c("red", "green", "blue", "yellow", "purple", "orange", "pink", "brown", "grey", "cyan")
kmeans_viz <- cbind(com_mat_met, kmeans_all['cluster']) %>% mutate(cluster = as.factor(cluster)) %>%  st_as_sf(coords = c("x","y"), crs = 4326)
tm_shape(kmeans_viz) + tm_symbols(col = "cluster", size = 0.7, palette = colors_vector)

fviz_cluster(kmeans_all, data = pca_scores_all,
             palette = "Set3", 
             geom = "point",
             ellipse.type = "convex", 
             ggtheme = theme_bw()
             )

# Join data frames + Re-order groups numbers: need manual check

kmeans_ord <- inner_join(com_mat_met, com_mat_env) %>% column_to_rownames("haul_id") %>% subset(., select = env_selection) %>% # Join metadata + environmental data
  cbind(kmeans_all['cluster']) %>% # Bind the vector of classification
  mutate(cluster = case_when( # Using the visualization map, renumber the groups as needed
    cluster == 1 ~ 2, 
    cluster == 2 ~ 3, 
    cluster == 3 ~ 5,
    cluster == 4 ~ 1,
    cluster == 5 ~ 4)) %>% 
mutate(cluster = as.factor(cluster)) %>% 
mutate(substrate = log(substrate + 1)) # Log transform substrate

# Save cluster vector
clust_vector <- kmeans_ord %>% dplyr::select(cluster)
save(clust_vector, file = paste0(path_raw,"Output/Clust_pred/", basin, "/R_products/clust_vector.Rdata"))

# Generate the final map of K-means classification

colors <- c("#FFD700", "#FFA500", "#FF6347", "#FFC0CB", "#8B3A3A")

kmeans_sf <- cbind(kmeans_ord, com_mat_met) %>% 
  st_as_sf(coords = c("x","y"), crs = 4326)

kmeans_map <- tm_shape(coastline_basin) + tm_polygons(col= "gray92", border.col = "black", lwd = 0.3) +
  tm_shape(kmeans_sf) + tm_symbols(col = "cluster", size = 0.3, palette = colors, border.col = NA,  border.lwd = 0.6, sizes.legend = c(10)) + 
  tm_graticules(labels.inside.frame = F, lines = F, labels.size = 1.5) +
  tm_legend(outside = TRUE, outside.position = "right", legend.show = TRUE) +
  tm_layout(main.title.size = 1.5, saturation = 1.5, frame = T, inner.margins = 0,legend.text.size = 1.2, legend.title.size = 1.3)
kmeans_map

# Save
tmap_save(kmeans_map, paste0(path_raw, "Output/Clust_pred/", basin, "/kmeans_map.png"), width = 13, height = 9)
```

# Indicator species

- Caution: the package gt, to plot table, work only if google chrome window is closed

```{r}
# Apply IndVal function on the hellinger matrix and using the K-means classification 
indval <- multipatt(func = "IndVal.g", com_mat_hel_all, kmeans_ord[['cluster']], control = how(nperm=999))

# Format the table by extracted species names
indval_table <- formatted_indVal(indval_file = indval$sign, n_opt) 

# Save
save_kable(indval_table, file = paste0(path_raw,"Output/Clust_pred/", basin, "/indval.html"), self_contained = T)

# Generate a clean table with the gt package

gt <- indval_table %>% dplyr::select(-index) %>% gt() %>% 
  cols_label(Bioregion_1 = md("**Bioregion 1**")) %>% 
  cols_label(Bioregion_2 = md("**Bioregion 2**")) %>% 
  cols_label(Bioregion_3 = md("**Bioregion 3**")) %>%
  cols_label(Bioregion_4 = md("**Bioregion 4**")) %>%
  cols_label(Bioregion_5 = md("**Bioregion 5**")) %>%
  cols_align(align = "center", columns = everything()) %>% 
  tab_style(
    style = cell_fill(color = colors[1], alpha = 0.5),
    locations = cells_body(columns = Bioregion_1)) %>% 
   tab_style(
    style = cell_fill(color = colors[2], alpha = 0.5),
    locations = cells_body(columns = Bioregion_2)) %>% 
  tab_style(
    style = cell_fill(color = colors[3], alpha = 0.5),
    locations = cells_body(columns = Bioregion_3)) %>%
  tab_style(
     style = cell_fill(color = colors[4], alpha = 0.5),
     locations = cells_body(columns = Bioregion_4)) %>%
   tab_style(
     style = cell_fill(color = colors[5], alpha = 0.5),
     locations = cells_body(columns = Bioregion_5)) %>%
  tab_header( title = "Indicator taxa") %>%
  tab_options(table_body.hlines.color = "transparent")
gt 

# Save: need to close Google Chrome windows
gtsave(gt, filename = "ind_tab.png",  path = paste0(path_raw,"Output/Clust_pred/", basin))
```

# Random Forest predictions

RF fitting: 500 trees by default but 1000, 10000 tree were tested and the performance was not increased. 

```{r}
# RF fitting
# ----------

# Split in training and testing datasets
set.seed(123)
split <- sample.split(kmeans_ord, SplitRatio = 0.8) 
training_data <- subset(kmeans_ord, split == "TRUE")
testing_data <- subset(kmeans_ord, split == "FALSE")

# RF formula : cluster (factor variable) in function of environmental predictors from the training data
RF <- randomForest(cluster~., data= training_data, ntree = 500)

# Environmental variables contribution
# ------------------------------------

# Format
var_imp <- as.data.frame(varImpPlot(RF)) %>% rownames_to_column("predictors") %>% 
  arrange(-MeanDecreaseGini)

# Plot
var_imp_plot <- ggplot(var_imp, aes(x = MeanDecreaseGini, y = reorder(predictors, MeanDecreaseGini))) +
  geom_point(size = 2, color = "black") +
  labs(x = "Gini Index",
       y = "Variables") +
  theme_bw()+ 
  theme(axis.text.y = element_text(size = 12), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14))
var_imp_plot

ggsave(paste0(path_raw,"Output/Clust_pred/", basin, "/var_importance.png"), width = 5, height = 5)

# RF Evaluation
# -------------

# RF Predictions on testing data (not known to calibrate the model)
pred_test <- predict(RF, newdata = testing_data, type= "class")

# Confusion matrix 
confusion_matrix <- confusionMatrix(table(pred_test,testing_data$cluster)) 

# Global accuracy of the RF model
evaluation <- confusion_matrix[["overall"]][["Accuracy"]]
print(evaluation)
# => WEST: 0.89

# Generate an evaluation table with evaluation metrics per each class (i.e. How RF predicts well each bioregion)
evaluation_table <- as.data.frame(confusion_matrix[["byClass"]]) %>% round(2) %>% rownames_to_column("Bioregion") %>% 
  mutate(Bioregion = gsub("^Class: ", "", Bioregion)) %>% 
  dplyr::select(Bioregion, Sensitivity, Specificity, Prevalence, 'Balanced Accuracy') %>% gt() %>% 
  cols_align(align = "center", columns = everything()) # Statistics per class
evaluation_table
gtsave(evaluation_table, filename = "eval_table.png",  path = paste0(path_raw,"Output/Clust_pred/", basin))

# RF Response curves
# ------------------

# Generate response curves for each environmental predictor

response_curves <- lapply(env_selection, generate_response_curve, data= training_data, cluster_number= n_opt, palette=colors, level = c(1:5)) # Change levels if numbers of bioregions has changed (here level = 1:5 because n_optimal = 5)

# Arrange the plots in a grid
response_curves_panel <- do.call(grid.arrange, c(response_curves, ncol = 2))
ggsave(paste0(path_raw,"Output/Clust_pred/", basin, "/response_curves.png"), response_curves_panel, height = 9, width = 8)

# RF projection in the all sub-basin
# ----------------------------------

# Stack of environmental variables
env_stack_pred <- env_stack %>% subset(env_selection)

# Loop to obtain the prediction of each cluster (.i.e : bioregion)

pred_clust <- NULL
map_clust <- NULL
palette <- c("Blues", "Greens", "Reds", "Purples", "Greys")

for (i in 1:n_opt) {

# Prediction using RF model and stack of environmental variable in the basin
pred <- terra::predict(model = RF, object = env_stack_pred, type="prob", index = i) 
pred_clust[[i]] <- pred

# Map
map <- tm_shape(coastline_basin) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) +  
  tm_shape(pred, is.master = T) + 
  tm_raster(style = "cont", breaks = seq(0, 1, by = 0.1), palette = palette[i], title = "probability", legend.reverse = T) + 
  tm_legend(outside = TRUE, outside.position = "right", legend.show = TRUE) +
  tm_layout(main.title = paste0("Bioregion ", i), main.title.size = 1.5, saturation = 1.5, frame = T, inner.margins = 0,legend.text.size = 1.2, legend.title.size = 1.2)

map_clust[[i]] <- map

}

# ------------------------------------ End 

# Arrange maps in one panel
map_clust_panel <- tmap_arrange(map_clust)

# Save
tmap_save(map_clust_panel, paste0(path_raw,"Output/Clust_pred/", basin, "/map_pred_clust.png"), width = 13, height = 9)

```

# Ecological description of bioregions

```{r}
# Diversity index
# ---------------

bio_div <- cbind(com_mat_all, kmeans_ord[['cluster']]) # Bind community matrix and k-means classification
s_rich <- specnumber(com_mat_all, groups = kmeans_ord[['cluster']]) # Species richness
div <- diversity(com_mat_all, index = "shannon", groups = kmeans_ord[['cluster']]) # Shannon
pielou <- div/log(s_rich) # Pielou
  
# Radar plot of diversity and environmental ranges
# ------------------------------------------------

# Format dataframe
radar_dataset <- kmeans_ord %>% group_by(cluster) %>% dplyr::summarise_all(mean) %>% ungroup() %>% 
  mutate(cluster = paste0("Bioregion_", cluster)) %>% column_to_rownames("cluster") %>% 
  mutate(depth = -depth) %>% 
  cbind(s_rich) %>% 
  cbind(pielou)

# Take min and max of each variable
min <- apply(radar_dataset, 2, min)
max <- apply(radar_dataset, 2, max)

radar_min_max <- rbind(max, min, radar_dataset) 
rownames(radar_min_max)[1] <- "min"
rownames(radar_min_max)[2] <- "max"

# Generate the Radar plots for each bioregion

svg(paste0(path_raw, "Output/Clust_pred/", basin, "/radar_plot.svg"), height= 6, width=15)
par(mfrow = c(1,5), mar = c(0, 1, 1, 1))

for (i in 1:n_opt){
radarchart(radar_min_max[c(1, 2, i+2),],
                    # Customize the polygons
                    pcol = colors[i], pfcol = scales::alpha(colors[i], 0.5), plwd = 1.5, plty = 1,
                    # Customize the grid
                    cglcol = "black", cglty = 1, cglwd = 0.8,
                    # Customize the axis
                    axislabcol = "black",
                    # Variable labels
                    vlcex = 1.2)
}

dev.off()
par(mfrow = c(1,1))
```

# Raster layers of bioregions

```{r}
# Stack of cluster predictions
pred_cluster_stack <- rast(pred_clust)
names(pred_cluster_stack) <- c("clust_1","clust_2", "clust_3", "clust_4", "clust_5")

# Transforme in dataframe
pred_cluster_table <- as.data.frame(pred_cluster_stack, xy = TRUE) %>% 
  pivot_longer(cols = c("clust_1","clust_2", "clust_3", "clust_4", "clust_5") , names_to = "clust", values_to = "value") %>% 
  group_by(x, y) %>% 
  filter(value == max(value)) %>% as.data.frame() %>% 
  mutate(clust = as.numeric(str_remove(clust,"clust_")))

# Transform in raster layers
pred_cluster_coord <- pred_cluster_table[,c(1,2)] %>%  as.matrix()
pred_cluster_distinct <- rasterize(pred_cluster_coord, pred_cluster_stack[[1]], values=pred_cluster_table$clust)
  terra::mask(rock_filter, inverse = TRUE) # remove hard substrate pixels

# Write raster layers
writeRaster(pred_cluster_distinct, paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_bioregion_",basin, ".tif"), overwrite = TRUE)
```

# Final map of bioregions

```{r}
# Mask of no-data areas
nodata_area <- st_read(here::here ("Data/Environment/Country/GSA/GSAs_simplified.shp")) %>% 
    filter(SECT_COD %in% c("GSA03", "GSA04", "GSA12"))

# Map
map_cluster_dist <- tm_shape(coastline_basin) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) + 
  tm_shape(pred_cluster_distinct, is.master = TRUE) +
  tm_raster(style = "cat", n = 10, palette = colors, alpha = 0.7, title = "Bioregion") +
  tm_shape(coastline_basin) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) +
  tm_shape(nodata_area) + tm_polygons(col="grey92", border.col = "grey", lwd = 0.3, alpha = 0.5) +
  tm_shape(below_1000m) + tm_raster(style = "cont", palette = "white", title = "",legend.show = FALSE) +
  tm_graticules(labels.inside.frame = F, lines = F, labels.size = 1.5) +
  tm_legend(outside = TRUE, outside.position = "right", legend.show = TRUE) +
  tm_layout(main.title = "", main.title.size = 1.5, saturation = 1, legend.text.size = 2, legend.title.size = 2, frame = T, inner.margins = 0) 
map_cluster_dist

# Save
tmap_save(map_cluster_dist, paste0(path_raw, "Output/Clust_pred/", basin, "/map_distinct_clust.png"), width = 13, height = 9)
tmap_save(map_cluster_dist, paste0(path_raw, "Output/Clust_pred/", basin, "/map_distinct_clust.svg"), width = 13, height = 9)
```

# Environmental extrapolation + model residuals

```{r}
# Environmental extrapolation
# ---------------------------

# Observed environment: environmental values for each MEDITS stations used in the training data
observed_env <- training_data %>% dplyr::select(-cluster)

# Environmental variables
covariates <- c("depth", "ph_mean", "si_mean", "current_speed_mean", "temp_mean", "salinity_mean", "substrate")

# Crds
aftt_crs<- sp::CRS("+proj=longlat +datum=WGS84 +no_defs")

# Predicted environment: all environmental background from Mediterranean basin 
predicted_env <- as.data.frame(env_stack, na.rm = TRUE, xy = TRUE)

# Compute extrapolations using Exdet
exdet <- compute_extrapolation(samples = observed_env, covariate.names = covariates, prediction.grid = predicted_env, coordinate.system = aftt_crs)

univariate <- rast(exdet$rasters$ExDet$univariate) # unknown environmental conditions
analogue <- rast(exdet$rasters$ExDet$analogue) # known environmental conditions

# Save
writeRaster(univariate, paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_univariate_",basin, ".tif"), overwrite = TRUE)
writeRaster(analogue, paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_analogue_",basin, ".tif"), overwrite = TRUE)

# Model residuals
# ---------------

sites_vect <- terra::vect(com_mat_met, geom = c("x","y"), crs = "epsg:4326") # Metadata of MEDITS stations
cluster_pred <- terra::extract(pred_cluster_distinct, sites_vect) # Classification obtained by RF predictions

# Calculation of residuals
residuals <- cbind(com_mat_met, cluster_pred) %>% 
  cbind(kmeans_ord['cluster']) %>% 
  dplyr::rename(clust_obs = cluster, clust_pred = last) %>% 
  dplyr::select(-ID, -haul_id) %>% 
  # residual=1 => cluster observed (k-means) not equal to cluster predicted by RF
  mutate(residual = case_when(clust_obs != clust_pred ~ 1,
                              TRUE ~ 0)) %>%
  mutate(residual = as.factor(residual))

# Transform points in spatial points
residuals_sf <- residuals %>% st_as_sf(coords = c("x","y"), crs = 4326)

# Save
save(residuals, file = paste0(path_raw,"Output/Clust_pred/", basin, "/R_products/residuals.Rdata"))

# Error map
# ---------
univariate <- rast(paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_univariate_",basin, ".tif"))
analogue <- rast(paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_analogue_",basin, ".tif"))
load(paste0(path_raw,"Output/Clust_pred/", basin, "/R_products/residuals.Rdata"))

palette_orange <- brewer.pal(name="Oranges",n=9)[2:9]
palette_green <- brewer.pal(name="Greens",n=9)[2:9]

error_map <- tm_shape(coastline_basin) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) +
  tm_shape(univariate) + 
  tm_raster(style = "cont", palette = palette_orange, title = "univariate") + 
  tm_shape(analogue) +
  tm_raster(style = "cont", palette = palette_green, title = "analogue", legend.reverse = F) +
  tm_shape(residuals_sf) + tm_symbols(col = "residual", size = 0.1, palette = c("black","red"), border.col = NA,  border.lwd = 0.1) +
  tm_graticules(labels.inside.frame = F, lines = F, labels.size = 1.5) +
  tm_legend(outside = TRUE, outside.position = "right", legend.show = TRUE) +
  tm_layout(main.title.size = 1.5, saturation = 1.5, frame = T, inner.margins = 0, legend.text.size = 1.2, legend.title.size = 1.3)
error_map

# Save 
tmap_save(error_map, paste0(path_raw, "Output/Clust_pred/", basin, "/error_map.png"), width = 13, height = 9)

# Residuals percentages
# ---------------------

residuals_percentage <- residuals %>% 
  group_by(clust_obs) %>% 
  mutate(missclassif_perc = (sum(residual == 1) / n())*100) %>% # percentage of residuals among all MEDITS stations per cluster (i.e. bioregion)
  mutate(n=n()) %>% 
  distinct(clust_obs, .keep_all = T) %>% 
  mutate(goodclassif_perc = 100 - missclassif_perc) %>% 
  dplyr::rename('misclassified' = missclassif_perc, 'well classified' = goodclassif_perc) %>% 
  pivot_longer(cols = c('misclassified', 'well classified'), names_to = "Type", values_to = "classif_perc")

# Save
save(residuals_percentage, file = paste0(path_raw,"Output/Clust_pred/", basin, "/R_products/residuals_percentage.Rdata"))

# Graph 
res_perc_plot <- ggplot(residuals_percentage, aes(x = clust_obs, y = classif_perc, fill = Type)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(x = "Cluster",
       y = "Percentage (%)") +
  scale_fill_manual(values = c("salmon","grey83")) +
  theme_bw() +
  theme(axis.text.x = element_text(size = 20), axis.text.y = element_text(size = 20), axis.title = element_text(size = 20), legend.position = "bottom", legend.title = element_text(size = 20), legend.text = element_text(size = 20))
res_perc_plot

# Save
ggsave(paste0(path_raw, "Output/Clust_pred/", basin, "/misclassified_perc.png"), res_perc_plot, width = 8, height = 6)
```