--- title: "3_Classification_WEST" author: "Jade Millot" date: "2025-01-16" output: html_document --- # Global description: In this script we identify and predict bioregions in the Western Mediterranean basin based on the composition in mega-invertebrates recorded during MEDITS surveys. Stations are classified using k-means clustering, bioregion distributions are predicted through a Random Forest (RF) model based on environmental conditions and indicator species are identified for each bioregion. # Clean space ```{r setup, include = FALSE} knitr::opts_chunk$set( eval = T, echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE ) ``` # Library loading ```{r packages_analysis, echo = TRUE} ## General library(tidyverse) ## Correlation library(Hmisc) library(corrplot) library(gplots) ##Spatials library(rgdal) library(ggspatial) library(sp) library(sf) library(raster) library(here) library(terra) library(mapview) library(tmap) ## Models library(mgcv) library(performance) library(MuMIn) library(biomod2) library(blockCV) # Biodiversity analysis library(vegan) library(ade4) library(factoextra) library(cluster) library(betapart) library(matrixStats) library(indicspecies) library(ape) library(NbClust) # Plot library(gratia) library(ggrepel) library(RColorBrewer) library(formattable) library(htmltools) library(kableExtra) library(gt) library(mapedit) # Other library(udunits2) library(R.utils) library(scales) library(gridExtra) library(purrr) library(rlist) library(vegan) library(caret) library(caTools) library(randomForest) library(fmsb) # Extrapolation library(remotes) library(dsmextra) ``` # Functions ```{r} # Function to plot a broken stick model for a PCA # ----------------------------------------------- evplot = function(ev) { # Broken stick model (MacArthur 1957) n = length(ev) bsm = data.frame(j=seq(1:n), p=0) bsm$p[1] = 1/n for (i in 2:n) bsm$p[i] = bsm$p[i-1] + (1/(n + 1 - i)) bsm$p = 100*bsm$p/n # Plot eigenvalues and % of variation for each axis op = par(mfrow=c(2,1),omi=c(0.1,0.3,0.1,0.1), mar=c(1, 1, 1, 1)) barplot(ev, main="Eigenvalues", col="bisque", las=2) abline(h=mean(ev), col="red") legend("topright", "Average eigenvalue", lwd=1, col=2, bty="n") barplot(t(cbind(100*ev/sum(ev), bsm$p[n:1])), beside=TRUE, main="% variation", col=c("bisque",2), las=2) legend("topright", c("% eigenvalue", "Broken stick model"), pch=15, col=c("bisque",2), bty="n") par(op) } # Function to format a table from IndVal function (10 indicator taxa per bioregion) # ----------------------------------------------------------------- formatted_indVal <- function(indval_file = indval_ALL$sign, n_opt = 5) { T_indSp_list <- list() for (n in 1:n_opt){ set_indSp <- indval_file %>% rownames_to_column(var="species") %>% rowwise() %>% mutate(sum_index = sum(across(starts_with("s.")), na.rm = T)) %>% as.data.frame() %>% filter(.data[[colnames(.)[n+1]]] == 1 & sum_index == 1) %>% arrange(-stat) %>% ungroup() %>% dplyr::slice(1:10) %>% # only select the 10 first taxa dplyr::select(species) %>% mutate(clust = paste0("Bioregion_", n)) %>% rownames_to_column(var="index") T_indSp_list[[n]] <- set_indSp } T_indSp <- do.call("bind_rows", T_indSp_list) %>% pivot_wider(names_from = "clust", values_from = "species") %>% mutate_all(~ifelse(is.na(.), "-", .)) return (T_indSp) #print(kbl_indSp) } # Function to generate Response curves of RF models # ------------------------------------------------- generate_response_curve <- function(predictor, data, cluster_number, level, palette) { # Create a sequence of values for the predictor predictor_seq <- seq(min(data[[predictor]]), max(data[[predictor]]), length.out = 100) # Keep the other predictors to their mean predictors_mean <- data %>% dplyr::select(-cluster) %>% summarise_all(mean, na.rm = TRUE) %>% dplyr::select(-!!predictor) # Table pred_data <- as.data.frame(cbind(predictor_seq, predictors_mean)) %>% dplyr::rename(!!predictor := predictor_seq) # Use the model to predict probabilities for each class predicted_probs <- predict(RF, newdata = pred_data, type = "prob") # Convert predictions to long format for ggplot pred_long <- cbind(pred_data, predicted_probs) %>% pivot_longer(cols = tail(names(.), cluster_number), names_to = "cluster", values_to = "Probability") %>% mutate(cluster = factor(cluster, levels = level)) # Plot the response curve ggplot(pred_long, aes_string(x = predictor, y = "Probability", color = "cluster")) + geom_line(size = 1) + labs(x = predictor, y = "Predicted Probability") + scale_color_manual(values = palette) + theme_minimal() } ``` # Load data ```{r} # Path basin <- "WEST" path_raw <- "E:/PhD_2022_2023/Project/Bioregionalisation/R/" path_basin <- paste0(path_raw, basin) # Load load(paste0(path_raw,"Data/Community_matrix/BENT_", basin,".Rdata")) # Community matrix load(paste0(path_raw,"Data/Community_matrix/BENT_met_", basin,".Rdata")) # Metadata of MEDITS stations load(paste0(path_raw,"Data/Community_matrix/BENT_env_", basin,".Rdata")) # Environmental data of MEDITS stations env_stack <- rast(paste0(path_raw,"Data/Environment/env_stack_", basin,".tif")) # Stack of environmental variables env_stack[['substrate']] <- log(env_stack[['substrate']] + 1) # Log transform the substrate variable # Names com_mat_all <- BENT_WEST com_mat_met <- BENT_met_WEST com_mat_env <- BENT_env_WEST com_mat_hel_all <- decostand(com_mat_all, method = "hellinger") # Coastline Med sf_use_s2(FALSE) coastline_med <- st_read("E:/PhD_2022_2023/Project/SDM/R/Data/Country/land-polygons/coastline_med_rap.shp") # Costaline basin - extent coastline_basin <- coastline_med %>% st_crop(xmin=-6, xmax=19, ymin=34, ymax=45) # Mask for hard substrate to remove rock_filter <- st_read("E:/PhD_2022_2023/Project/SDM/R/Data/Environment/EUNIS/rock_filter.shp") # Mask of the Mediterranean above 1000m study_extent_med <- ext(-6,37,30,46) below_1000m <- rast(here::here("Data/Environment/Terrain/depth.tif")) %>% terra :: crop(study_extent_med) %>% clamp(upper=-1000, values = FALSE) ``` # Correlations of environmental variables ```{r} # Load environmental space of the Mediterranean background load(paste0(path_raw,"Data/Community_matrix/BENT_extract.Rdata")) # Select environmental variables com_mat_med_env <- BENT_extract %>% dplyr::select(-area, -year, -x, -y, -haul_id) %>% dplyr::select(-contains("ltmin")) %>% dplyr::select(-contains("ltmax")) %>% dplyr::select(-contains("swd")) %>% dplyr::select(-contains("range")) # Generate the correlation matrix cor_mat <- cor(com_mat_med_env, use = "pairwise.complete.obs", method = "spearman") col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")) png(height= 1200, width=1400, file= paste0(path_raw, "Output/Clust_pred/cor_mat_met.png"), type = "cairo") corr_matrix_env_abs <- abs(cor_mat) corr_BO <- corrplot(cor_mat, method = "color", col = col(200), type = "upper", addCoef.col = "black", tl.col = "red", tl.srt = 45, p.mat = corr_matrix_env_abs, sig.level = 0.70, diag = FALSE, number.cex = 1, tl.cex = 1, cl.cex = 1, pch.cex = 4) dev.off() # First selection of environmental variables (1 keep for each pair with Spearman coefficient > 0.7) env_selection <- c("depth", "ph_mean", "si_mean", "current_speed_mean", "temp_mean", "salinity_mean", "substrate") # Generate a second correlation matrix with the selected variables to check cor_mat <- cor(subset(com_mat_med_env, select = env_selection), use = "pairwise.complete.obs", method = "spearman") col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA")) png(height= 1200, width=1400, file= paste0(path_raw, "Output/Clust_pred/cor_mat_met_final.png"), type = "cairo") corr_matrix_env_abs <- abs(cor_mat) corr_BO <- corrplot(cor_mat, method = "color", col = col(200), type = "upper", addCoef.col = "black", tl.col = "red", tl.srt = 45, p.mat = corr_matrix_env_abs, sig.level = 0.70, diag = FALSE, number.cex = 0.8, tl.cex = 1, cl.cex = 1, pch.cex = 4) dev.off() ``` # PCA: to summarize the community matrix ```{r} # Distance matrix com_mat_hel_all <- decostand(com_mat_all, method = "hellinger") # PCA pca_all <- dudi.pca(com_mat_hel_all, scannf = FALSE, nf = 40) # Generate eigenvalues evplot(pca_all[["eig"]]) # broken stick model pca_all[["eig"]] # Only stock axis wit eig > 2 n_opt_all <- 21 pca_scores_all <- pca_all$li[1:n_opt_all] # Vizualisation fviz_pca_ind(pca_all, label = "none", axes=c(1,2)) ``` # K-means: classification of stations with similar biotic composition ```{r} # Nb clust check com_mat_all_matrix <- as.matrix(pca_scores_all) fun <- NbClust(com_mat_all_matrix, distance = "euclidean", min.nc = 3, max.nc = 10, method = "kmeans") # Optimal number of clusters : check results of fun n_opt <- 5 # K-means : applied on PCA scores and using the optimal number of clusters kmeans_all <- kmeans(pca_scores_all, centers = n_opt, nstart = 100) # Quick map to vizualise the groups of stations colors_vector <- c("red", "green", "blue", "yellow", "purple", "orange", "pink", "brown", "grey", "cyan") kmeans_viz <- cbind(com_mat_met, kmeans_all['cluster']) %>% mutate(cluster = as.factor(cluster)) %>% st_as_sf(coords = c("x","y"), crs = 4326) tm_shape(kmeans_viz) + tm_symbols(col = "cluster", size = 0.7, palette = colors_vector) fviz_cluster(kmeans_all, data = pca_scores_all, palette = "Set3", geom = "point", ellipse.type = "convex", ggtheme = theme_bw() ) # Join data frames + Re-order groups numbers: need manual check kmeans_ord <- inner_join(com_mat_met, com_mat_env) %>% column_to_rownames("haul_id") %>% subset(., select = env_selection) %>% # Join metadata + environmental data cbind(kmeans_all['cluster']) %>% # Bind the vector of classification mutate(cluster = case_when( # Using the visualization map, renumber the groups as needed cluster == 1 ~ 2, cluster == 2 ~ 3, cluster == 3 ~ 5, cluster == 4 ~ 1, cluster == 5 ~ 4)) %>% mutate(cluster = as.factor(cluster)) %>% mutate(substrate = log(substrate + 1)) # Log transform substrate # Save cluster vector clust_vector <- kmeans_ord %>% dplyr::select(cluster) save(clust_vector, file = paste0(path_raw,"Output/Clust_pred/", basin, "/R_products/clust_vector.Rdata")) # Generate the final map of K-means classification colors <- c("#FFD700", "#FFA500", "#FF6347", "#FFC0CB", "#8B3A3A") kmeans_sf <- cbind(kmeans_ord, com_mat_met) %>% st_as_sf(coords = c("x","y"), crs = 4326) kmeans_map <- tm_shape(coastline_basin) + tm_polygons(col= "gray92", border.col = "black", lwd = 0.3) + tm_shape(kmeans_sf) + tm_symbols(col = "cluster", size = 0.3, palette = colors, border.col = NA, border.lwd = 0.6, sizes.legend = c(10)) + tm_graticules(labels.inside.frame = F, lines = F, labels.size = 1.5) + tm_legend(outside = TRUE, outside.position = "right", legend.show = TRUE) + tm_layout(main.title.size = 1.5, saturation = 1.5, frame = T, inner.margins = 0,legend.text.size = 1.2, legend.title.size = 1.3) kmeans_map # Save tmap_save(kmeans_map, paste0(path_raw, "Output/Clust_pred/", basin, "/kmeans_map.png"), width = 13, height = 9) ``` # Indicator species - Caution: the package gt, to plot table, work only if google chrome window is closed ```{r} # Apply IndVal function on the hellinger matrix and using the K-means classification indval <- multipatt(func = "IndVal.g", com_mat_hel_all, kmeans_ord[['cluster']], control = how(nperm=999)) # Format the table by extracted species names indval_table <- formatted_indVal(indval_file = indval$sign, n_opt) # Save save_kable(indval_table, file = paste0(path_raw,"Output/Clust_pred/", basin, "/indval.html"), self_contained = T) # Generate a clean table with the gt package gt <- indval_table %>% dplyr::select(-index) %>% gt() %>% cols_label(Bioregion_1 = md("**Bioregion 1**")) %>% cols_label(Bioregion_2 = md("**Bioregion 2**")) %>% cols_label(Bioregion_3 = md("**Bioregion 3**")) %>% cols_label(Bioregion_4 = md("**Bioregion 4**")) %>% cols_label(Bioregion_5 = md("**Bioregion 5**")) %>% cols_align(align = "center", columns = everything()) %>% tab_style( style = cell_fill(color = colors[1], alpha = 0.5), locations = cells_body(columns = Bioregion_1)) %>% tab_style( style = cell_fill(color = colors[2], alpha = 0.5), locations = cells_body(columns = Bioregion_2)) %>% tab_style( style = cell_fill(color = colors[3], alpha = 0.5), locations = cells_body(columns = Bioregion_3)) %>% tab_style( style = cell_fill(color = colors[4], alpha = 0.5), locations = cells_body(columns = Bioregion_4)) %>% tab_style( style = cell_fill(color = colors[5], alpha = 0.5), locations = cells_body(columns = Bioregion_5)) %>% tab_header( title = "Indicator taxa") %>% tab_options(table_body.hlines.color = "transparent") gt # Save: need to close Google Chrome windows gtsave(gt, filename = "ind_tab.png", path = paste0(path_raw,"Output/Clust_pred/", basin)) ``` # Random Forest predictions RF fitting: 500 trees by default but 1000, 10000 tree were tested and the performance was not increased. ```{r} # RF fitting # ---------- # Split in training and testing datasets set.seed(123) split <- sample.split(kmeans_ord, SplitRatio = 0.8) training_data <- subset(kmeans_ord, split == "TRUE") testing_data <- subset(kmeans_ord, split == "FALSE") # RF formula : cluster (factor variable) in function of environmental predictors from the training data RF <- randomForest(cluster~., data= training_data, ntree = 500) # Environmental variables contribution # ------------------------------------ # Format var_imp <- as.data.frame(varImpPlot(RF)) %>% rownames_to_column("predictors") %>% arrange(-MeanDecreaseGini) # Plot var_imp_plot <- ggplot(var_imp, aes(x = MeanDecreaseGini, y = reorder(predictors, MeanDecreaseGini))) + geom_point(size = 2, color = "black") + labs(x = "Gini Index", y = "Variables") + theme_bw()+ theme(axis.text.y = element_text(size = 12), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 14), axis.title.y = element_text(size = 14)) var_imp_plot ggsave(paste0(path_raw,"Output/Clust_pred/", basin, "/var_importance.png"), width = 5, height = 5) # RF Evaluation # ------------- # RF Predictions on testing data (not known to calibrate the model) pred_test <- predict(RF, newdata = testing_data, type= "class") # Confusion matrix confusion_matrix <- confusionMatrix(table(pred_test,testing_data$cluster)) # Global accuracy of the RF model evaluation <- confusion_matrix[["overall"]][["Accuracy"]] print(evaluation) # => WEST: 0.89 # Generate an evaluation table with evaluation metrics per each class (i.e. How RF predicts well each bioregion) evaluation_table <- as.data.frame(confusion_matrix[["byClass"]]) %>% round(2) %>% rownames_to_column("Bioregion") %>% mutate(Bioregion = gsub("^Class: ", "", Bioregion)) %>% dplyr::select(Bioregion, Sensitivity, Specificity, Prevalence, 'Balanced Accuracy') %>% gt() %>% cols_align(align = "center", columns = everything()) # Statistics per class evaluation_table gtsave(evaluation_table, filename = "eval_table.png", path = paste0(path_raw,"Output/Clust_pred/", basin)) # RF Response curves # ------------------ # Generate response curves for each environmental predictor response_curves <- lapply(env_selection, generate_response_curve, data= training_data, cluster_number= n_opt, palette=colors, level = c(1:5)) # Change levels if numbers of bioregions has changed (here level = 1:5 because n_optimal = 5) # Arrange the plots in a grid response_curves_panel <- do.call(grid.arrange, c(response_curves, ncol = 2)) ggsave(paste0(path_raw,"Output/Clust_pred/", basin, "/response_curves.png"), response_curves_panel, height = 9, width = 8) # RF projection in the all sub-basin # ---------------------------------- # Stack of environmental variables env_stack_pred <- env_stack %>% subset(env_selection) # Loop to obtain the prediction of each cluster (.i.e : bioregion) pred_clust <- NULL map_clust <- NULL palette <- c("Blues", "Greens", "Reds", "Purples", "Greys") for (i in 1:n_opt) { # Prediction using RF model and stack of environmental variable in the basin pred <- terra::predict(model = RF, object = env_stack_pred, type="prob", index = i) pred_clust[[i]] <- pred # Map map <- tm_shape(coastline_basin) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) + tm_shape(pred, is.master = T) + tm_raster(style = "cont", breaks = seq(0, 1, by = 0.1), palette = palette[i], title = "probability", legend.reverse = T) + tm_legend(outside = TRUE, outside.position = "right", legend.show = TRUE) + tm_layout(main.title = paste0("Bioregion ", i), main.title.size = 1.5, saturation = 1.5, frame = T, inner.margins = 0,legend.text.size = 1.2, legend.title.size = 1.2) map_clust[[i]] <- map } # ------------------------------------ End # Arrange maps in one panel map_clust_panel <- tmap_arrange(map_clust) # Save tmap_save(map_clust_panel, paste0(path_raw,"Output/Clust_pred/", basin, "/map_pred_clust.png"), width = 13, height = 9) ``` # Ecological description of bioregions ```{r} # Diversity index # --------------- bio_div <- cbind(com_mat_all, kmeans_ord[['cluster']]) # Bind community matrix and k-means classification s_rich <- specnumber(com_mat_all, groups = kmeans_ord[['cluster']]) # Species richness div <- diversity(com_mat_all, index = "shannon", groups = kmeans_ord[['cluster']]) # Shannon pielou <- div/log(s_rich) # Pielou # Radar plot of diversity and environmental ranges # ------------------------------------------------ # Format dataframe radar_dataset <- kmeans_ord %>% group_by(cluster) %>% dplyr::summarise_all(mean) %>% ungroup() %>% mutate(cluster = paste0("Bioregion_", cluster)) %>% column_to_rownames("cluster") %>% mutate(depth = -depth) %>% cbind(s_rich) %>% cbind(pielou) # Take min and max of each variable min <- apply(radar_dataset, 2, min) max <- apply(radar_dataset, 2, max) radar_min_max <- rbind(max, min, radar_dataset) rownames(radar_min_max)[1] <- "min" rownames(radar_min_max)[2] <- "max" # Generate the Radar plots for each bioregion svg(paste0(path_raw, "Output/Clust_pred/", basin, "/radar_plot.svg"), height= 6, width=15) par(mfrow = c(1,5), mar = c(0, 1, 1, 1)) for (i in 1:n_opt){ radarchart(radar_min_max[c(1, 2, i+2),], # Customize the polygons pcol = colors[i], pfcol = scales::alpha(colors[i], 0.5), plwd = 1.5, plty = 1, # Customize the grid cglcol = "black", cglty = 1, cglwd = 0.8, # Customize the axis axislabcol = "black", # Variable labels vlcex = 1.2) } dev.off() par(mfrow = c(1,1)) ``` # Raster layers of bioregions ```{r} # Stack of cluster predictions pred_cluster_stack <- rast(pred_clust) names(pred_cluster_stack) <- c("clust_1","clust_2", "clust_3", "clust_4", "clust_5") # Transforme in dataframe pred_cluster_table <- as.data.frame(pred_cluster_stack, xy = TRUE) %>% pivot_longer(cols = c("clust_1","clust_2", "clust_3", "clust_4", "clust_5") , names_to = "clust", values_to = "value") %>% group_by(x, y) %>% filter(value == max(value)) %>% as.data.frame() %>% mutate(clust = as.numeric(str_remove(clust,"clust_"))) # Transform in raster layers pred_cluster_coord <- pred_cluster_table[,c(1,2)] %>% as.matrix() pred_cluster_distinct <- rasterize(pred_cluster_coord, pred_cluster_stack[[1]], values=pred_cluster_table$clust) terra::mask(rock_filter, inverse = TRUE) # remove hard substrate pixels # Write raster layers writeRaster(pred_cluster_distinct, paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_bioregion_",basin, ".tif"), overwrite = TRUE) ``` # Final map of bioregions ```{r} # Mask of no-data areas nodata_area <- st_read(here::here ("Data/Environment/Country/GSA/GSAs_simplified.shp")) %>% filter(SECT_COD %in% c("GSA03", "GSA04", "GSA12")) # Map map_cluster_dist <- tm_shape(coastline_basin) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) + tm_shape(pred_cluster_distinct, is.master = TRUE) + tm_raster(style = "cat", n = 10, palette = colors, alpha = 0.7, title = "Bioregion") + tm_shape(coastline_basin) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) + tm_shape(nodata_area) + tm_polygons(col="grey92", border.col = "grey", lwd = 0.3, alpha = 0.5) + tm_shape(below_1000m) + tm_raster(style = "cont", palette = "white", title = "",legend.show = FALSE) + tm_graticules(labels.inside.frame = F, lines = F, labels.size = 1.5) + tm_legend(outside = TRUE, outside.position = "right", legend.show = TRUE) + tm_layout(main.title = "", main.title.size = 1.5, saturation = 1, legend.text.size = 2, legend.title.size = 2, frame = T, inner.margins = 0) map_cluster_dist # Save tmap_save(map_cluster_dist, paste0(path_raw, "Output/Clust_pred/", basin, "/map_distinct_clust.png"), width = 13, height = 9) tmap_save(map_cluster_dist, paste0(path_raw, "Output/Clust_pred/", basin, "/map_distinct_clust.svg"), width = 13, height = 9) ``` # Environmental extrapolation + model residuals ```{r} # Environmental extrapolation # --------------------------- # Observed environment: environmental values for each MEDITS stations used in the training data observed_env <- training_data %>% dplyr::select(-cluster) # Environmental variables covariates <- c("depth", "ph_mean", "si_mean", "current_speed_mean", "temp_mean", "salinity_mean", "substrate") # Crds aftt_crs<- sp::CRS("+proj=longlat +datum=WGS84 +no_defs") # Predicted environment: all environmental background from Mediterranean basin predicted_env <- as.data.frame(env_stack, na.rm = TRUE, xy = TRUE) # Compute extrapolations using Exdet exdet <- compute_extrapolation(samples = observed_env, covariate.names = covariates, prediction.grid = predicted_env, coordinate.system = aftt_crs) univariate <- rast(exdet$rasters$ExDet$univariate) # unknown environmental conditions analogue <- rast(exdet$rasters$ExDet$analogue) # known environmental conditions # Save writeRaster(univariate, paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_univariate_",basin, ".tif"), overwrite = TRUE) writeRaster(analogue, paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_analogue_",basin, ".tif"), overwrite = TRUE) # Model residuals # --------------- sites_vect <- terra::vect(com_mat_met, geom = c("x","y"), crs = "epsg:4326") # Metadata of MEDITS stations cluster_pred <- terra::extract(pred_cluster_distinct, sites_vect) # Classification obtained by RF predictions # Calculation of residuals residuals <- cbind(com_mat_met, cluster_pred) %>% cbind(kmeans_ord['cluster']) %>% dplyr::rename(clust_obs = cluster, clust_pred = last) %>% dplyr::select(-ID, -haul_id) %>% # residual=1 => cluster observed (k-means) not equal to cluster predicted by RF mutate(residual = case_when(clust_obs != clust_pred ~ 1, TRUE ~ 0)) %>% mutate(residual = as.factor(residual)) # Transform points in spatial points residuals_sf <- residuals %>% st_as_sf(coords = c("x","y"), crs = 4326) # Save save(residuals, file = paste0(path_raw,"Output/Clust_pred/", basin, "/R_products/residuals.Rdata")) # Error map # --------- univariate <- rast(paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_univariate_",basin, ".tif")) analogue <- rast(paste0(path_raw, "Output/Clust_pred/", basin, "/R_products/raster_analogue_",basin, ".tif")) load(paste0(path_raw,"Output/Clust_pred/", basin, "/R_products/residuals.Rdata")) palette_orange <- brewer.pal(name="Oranges",n=9)[2:9] palette_green <- brewer.pal(name="Greens",n=9)[2:9] error_map <- tm_shape(coastline_basin) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) + tm_shape(univariate) + tm_raster(style = "cont", palette = palette_orange, title = "univariate") + tm_shape(analogue) + tm_raster(style = "cont", palette = palette_green, title = "analogue", legend.reverse = F) + tm_shape(residuals_sf) + tm_symbols(col = "residual", size = 0.1, palette = c("black","red"), border.col = NA, border.lwd = 0.1) + tm_graticules(labels.inside.frame = F, lines = F, labels.size = 1.5) + tm_legend(outside = TRUE, outside.position = "right", legend.show = TRUE) + tm_layout(main.title.size = 1.5, saturation = 1.5, frame = T, inner.margins = 0, legend.text.size = 1.2, legend.title.size = 1.3) error_map # Save tmap_save(error_map, paste0(path_raw, "Output/Clust_pred/", basin, "/error_map.png"), width = 13, height = 9) # Residuals percentages # --------------------- residuals_percentage <- residuals %>% group_by(clust_obs) %>% mutate(missclassif_perc = (sum(residual == 1) / n())*100) %>% # percentage of residuals among all MEDITS stations per cluster (i.e. bioregion) mutate(n=n()) %>% distinct(clust_obs, .keep_all = T) %>% mutate(goodclassif_perc = 100 - missclassif_perc) %>% dplyr::rename('misclassified' = missclassif_perc, 'well classified' = goodclassif_perc) %>% pivot_longer(cols = c('misclassified', 'well classified'), names_to = "Type", values_to = "classif_perc") # Save save(residuals_percentage, file = paste0(path_raw,"Output/Clust_pred/", basin, "/R_products/residuals_percentage.Rdata")) # Graph res_perc_plot <- ggplot(residuals_percentage, aes(x = clust_obs, y = classif_perc, fill = Type)) + geom_bar(stat = "identity", position = "stack") + labs(x = "Cluster", y = "Percentage (%)") + scale_fill_manual(values = c("salmon","grey83")) + theme_bw() + theme(axis.text.x = element_text(size = 20), axis.text.y = element_text(size = 20), axis.title = element_text(size = 20), legend.position = "bottom", legend.title = element_text(size = 20), legend.text = element_text(size = 20)) res_perc_plot # Save ggsave(paste0(path_raw, "Output/Clust_pred/", basin, "/misclassified_perc.png"), res_perc_plot, width = 8, height = 6) ```