--- title: "0_Megabenthos_data_formatting" author: "Jade Millot" date: "2025-01-16" output: html_document --- # Global description: In this script, we use megabenthos records from MEDITS surveys. We format the community matrices for each sub-basin and we build stack of environmental variables per sub-basin to perform predictions in the following scripts. # Clean space ```{r setup, include = FALSE} knitr::opts_chunk$set( eval = T, echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE ) ``` # Library loading ```{r packages_analysis, echo = TRUE} ## General library(tidyverse) ## Correlation library(Hmisc) library(corrplot) library(gplots) library(ggvenn) ##Spatials library(rgdal) library(ggspatial) library(sp) library(sf) library(raster) library(here) library(terrainr) library(terra) library(mapview) library(tmap) library(rworldmap) ## Models library(mgcv) library(performance) library(MuMIn) # Biodiversity analysis library(vegan) library(ade4) library(factoextra) library(cluster) library(betapart) library(matrixStats) library(indicspecies) library(ape) # Plot library(cowplot) library(ggpubr) library(ggeffects) library(gratia) library(ggrepel) library(VennDiagram) # Other library(udunits2) library(R.utils) library(scales) ``` # Load Megabenthos MEDITS dataset No public data ```{r} load(here::here("Data/MEDITS_data/MEDITS_UE_benthos/MED_BENT.Rdata")) ``` # Clean data - filter rare taxa - Remove inconsistent data from some region or period - Rare taxa filter: Taxa were grouped at a higher taxonomic level if they were recorded in less than 60% of the years per GSA, or if they accounted for less than 5% of all stations per GSA and year. ```{r} # Dataset formatting # ------------------ MED_BENT_filter <- MED_BENT %>% # Remove Crete data (GSA23) + Cyprus data (GSA25) : no systematic benthos sampling filter(!area %in% c("GSA_23","GSA_25")) %>% # Remove southern greec data : no systematic benthos sampling filter(!(y < 38.3 & y > 36.4 & x > 24.1 & x < 28.8)) %>% # Only keep years from 2016 to 2021 filter(year >= 2016 & year <= 2021) %>% # Remove data before 2018 in Malta and Western Greece : no systematic benthos sampling filter(!(area %in% c("GSA_15","GSA_20") & year < 2018)) %>% # Remove data before 2019 in Northern Greece : no systematic benthos sampling filter(!(area == "GSA_22" & year < 2019)) %>% # Define threshold to define rare taxa : 5% of total hauls per year and GSA group_by(year, area) %>% mutate(numb_haul_per_year = n_distinct(haul_number), tresh_haul = (5 * numb_haul_per_year)/100) %>% ungroup() %>% # Define threshold to define rare taxa : 60% of total hauls per GSA group_by(area) %>% mutate(tresh_year = 60 * n_distinct(year)/100) %>% ungroup() # Spatial vizualisation MED_BENT_filter_sf <- st_as_sf(MED_BENT_filter, coords = c("x", "y"), crs = 4326) mapview(MED_BENT_filter_sf) # Filtering of rare taxa # ---------------------- group_vars <- c("species", "genus", "family", "order" , "class") next_vars <- c("genus", "family", "order", "class" , "phylum") for (i in 1:5){ var <- group_vars[i] next_var <- next_vars[i] MED_BENT_filter <- MED_BENT_filter %>% group_by(year, area, .data[[var]]) %>% mutate(numb_haul_per_taxon = n_distinct(haul_number)) %>% ungroup() %>% group_by(area,.data[[var]]) %>% mutate(numb_year_per_taxon = n_distinct(year)) %>% ungroup() %>% # If a taxa fits the condition to be rare = the taxonomic level is increased mutate(taxon = case_when( .data[[next_var]] != "" & (numb_year_per_taxon < tresh_year | numb_haul_per_taxon < tresh_haul) ~ .data[[next_var]], TRUE ~ taxon)) } # Finalization of the formatting # ------------------------------ MED_BENT_filter <- MED_BENT_filter %>% # Remove taxa with rare phylum (jighest taxonomic level) group_by(area, year, phylum) %>% mutate(numb_haul_per_taxon = n_distinct(haul_number)) %>% ungroup() %>% group_by(area, phylum) %>% mutate(numb_year_per_taxon = n_distinct(year)) %>% ungroup() %>% filter(numb_year_per_taxon > tresh_year & numb_haul_per_taxon > tresh_haul) %>% # Columns selection dplyr::rename(g_km2 = p_km2) %>% dplyr::select(haul_id, area, year, haul_number, x, y, depth, taxon, g_km2) %>% # Sum biomass per taxa (Different taxa grouped under the same name after filtering rare taxa.) group_by(haul_id, taxon) %>% mutate(g_km2 = sum(g_km2)) %>% distinct(haul_id, taxon, .keep_all = TRUE) %>% ungroup() # Save save(MED_BENT_filter, file = "Data/MEDITS_data/MEDITS_UE_benthos/MED_BENT_filter.Rdata") ``` # Community matrix formatting ```{r} # Environmental variables stack env_stack <- rast("E:/PhD_2022_2023/Project/Bioregionalisation/R/Data/Environment/env_stack_med.tif") %>% subset(., !grepl("2000", names(.))) %>% subset(., !grepl("swd", names(.))) %>% subset(., !grepl("phyc", names(.))) %>% subset(., !grepl("SAR", names(.))) names(env_stack) <- gsub("_2010", "", names(env_stack)) names(env_stack) <- gsub("sws", "current_speed", names(env_stack)) names(env_stack) <- gsub("so", "salinity", names(env_stack)) names(env_stack) <- gsub("thetao", "temp", names(env_stack)) # Community matrix BENT_mat <- MED_BENT_filter %>% # Select columns dplyr::select(haul_id, area, year, x, y, taxon, g_km2) %>% # remove taxa with NA values mutate(taxon = gsub("\\s.*", "", taxon)) %>% # Sum biomass for similar taxa in a station group_by(haul_id, taxon) %>% mutate(g_km2 = sum(g_km2)) %>% # Check that combination of station x taxa are unique distinct(haul_id, taxon, .keep_all = TRUE) %>% ungroup() %>% # Pivot pivot_wider(names_from = "taxon", values_from = g_km2) %>% mutate_all(~replace(., is.na(.), 0)) # Save save(BENT_mat, file = "Data/Community_matrix/BENT_mat.Rdata") # Transform in vector BENT_mat_vect <- terra::vect(BENT_mat[,c(1,2,3,4,5)], geom = c("x","y"), crs = "epsg:4326") # Extract environmental variables for each benthic record BENT_extract <- terra :: extract(env_stack, BENT_mat_vect, ID = FALSE) %>% cbind(crds(BENT_mat_vect), values(BENT_mat_vect)) %>% drop_na() save(BENT_extract, file = "Data/Community_matrix/BENT_extract.Rdata") ``` ## Western basin ```{r} # GSA filter BENT_extract_WEST <- BENT_extract %>% # select GSA of the basin filter(area %in% c("GSA_1", "GSA_2", "GSA_5", "GSA_6", "GSA_7", "GSA_8", "GSA_9", "GSA_10", "GSA_11", "GSA_3", "GSA_4", "GSA_12")) # Environmental matrix BENT_env_WEST <- BENT_extract_WEST %>% dplyr::select(-area, -year, -x, -y) # Metadata matrix BENT_met_WEST <- BENT_extract_WEST %>% dplyr::select(haul_id, area, year, x, y) # Community matrix BENT_WEST <- inner_join(BENT_met_WEST, BENT_mat, by= c("haul_id","area","year","x", "y")) %>% column_to_rownames("haul_id") %>% dplyr::select( -area, -year, -x, -y) %>% # Biomass log transformation mutate_if(is.numeric, function(x) log(x+1)) %>% select_if(~ any(. != 0)) # Taxa vector WEST_taxa <- colnames(BENT_WEST) # Save save(BENT_env_WEST, file = "Data/Community_matrix/BENT_env_WEST.Rdata") save(BENT_met_WEST, file = "Data/Community_matrix/BENT_met_WEST.Rdata") save(BENT_WEST, file = "Data/Community_matrix/BENT_WEST.Rdata") ``` ## Central basin ```{r} # GSA filter BENT_extract_CENT <- BENT_extract %>% filter(area %in% c("GSA_13", "GSA_14", "GSA_15", "GSA_16","GSA_19", "GSA_20")) %>% # Remove outliers (Determined by the K-means analysis (in following script)) filter(!(haul_id %in% c("MEDITS_ITA_16_2016_22", "MEDITS_ITA_16_2016_41", "MEDITS_ITA_16_2016_78", "MEDITS_ITA_16_2016_83", "MEDITS_ITA_16_2016_84", "MEDITS_ITA_16_2016_85", "MEDITS_ITA_16_2016_86", "MEDITS_ITA_16_2016_102", "MEDITS_ITA_16_2016_103"))) # Environmental matrix BENT_env_CENT <- BENT_extract_CENT %>% dplyr::select(-area, -year, -x, -y) # Metadata matrix BENT_met_CENT <- BENT_extract_CENT %>% dplyr::select(haul_id, area, year, x, y) # Community matrix BENT_CENT <- inner_join(BENT_met_CENT, BENT_mat, by= c("haul_id","area","year","x", "y")) %>% column_to_rownames("haul_id") %>% dplyr::select(-area, -year, -x, -y) %>% # Biomass log transformation mutate_if(is.numeric, function(x) log(x+1)) %>% select_if(~ any(. != 0)) # Taxa vector CENT_taxa <- colnames(BENT_CENT) # Save save(BENT_env_CENT, file = "Data/Community_matrix/BENT_env_CENT.Rdata") save(BENT_met_CENT, file = "Data/Community_matrix/BENT_met_CENT.Rdata") save(BENT_CENT, file = "Data/Community_matrix/BENT_CENT.Rdata") ``` ## Adriatic basin ```{r} # GSA filter BENT_extract_ADRI <- BENT_extract %>% filter(area %in% c("GSA_17", "GSA_18")) # Environmental matrix BENT_env_ADRI <- BENT_extract_ADRI %>% dplyr::select(-area, -year, -x, -y) # Metadata matrix BENT_met_ADRI <- BENT_extract_ADRI %>% dplyr::select(haul_id, area, year, x, y) # Community matrix BENT_ADRI <- inner_join(BENT_met_ADRI, BENT_mat, by= c("haul_id","area","year","x", "y")) %>% column_to_rownames("haul_id") %>% dplyr::select( -area, -year, -x, -y) %>% # Biomass log transformation mutate_if(is.numeric, function(x) log(x+1)) %>% select_if(~ any(. != 0)) # Taxa vector ADRI_taxa <- colnames(BENT_ADRI) # Save save(BENT_env_ADRI, file = "Data/Community_matrix/BENT_env_ADRI.Rdata") save(BENT_met_ADRI, file = "Data/Community_matrix/BENT_met_ADRI.Rdata") save(BENT_ADRI, file = "Data/Community_matrix/BENT_ADRI.Rdata") ``` ## Aegean Sea ```{r} # GSA filter BENT_extract_AEGE <- BENT_extract %>% filter(area == "GSA_22") # Environmental matrix BENT_env_AEGE <- BENT_extract_AEGE %>% dplyr::select(-area, -year, -x, -y) # Metadata matrix BENT_met_AEGE <- BENT_extract_AEGE %>% dplyr::select(haul_id, area, year, x, y) # Community matrix BENT_AEGE <- inner_join(BENT_met_AEGE, BENT_mat, by= c("haul_id","area","year","x", "y")) %>% column_to_rownames("haul_id") %>% dplyr::select( -area, -year, -x, -y) %>% # Biomass log transformation mutate_if(is.numeric, function(x) log(x+1)) %>% select_if(~ any(. != 0)) # Taxa vector AEGE_taxa <- colnames(BENT_AEGE) # Save save(BENT_env_AEGE, file = "Data/Community_matrix/BENT_env_AEGE.Rdata") save(BENT_met_AEGE, file = "Data/Community_matrix/BENT_met_AEGE.Rdata") save(BENT_AEGE, file = "Data/Community_matrix/BENT_AEGE.Rdata") ``` ## Levantine basin ```{r} # Filter BENT_extract_LEVA <- BENT_extract %>% filter(area %in% c("GSA_24", "GSA_25", "GSA_26", "GSA_27")) %>% filter(!(haul_id %in% c("MEDITS_CYP_25_2019_25","MEDITS_CYP_25_2019_1", "MEDITS_CYP_25_2021_1", " MEDITS_CYP_25_2019_24","MEDITS_CYP_25_2021_8", "MEDITS_CYP_25_2021_24", "MEDITS_CYP_25_2018_5", "MEDITS_CYP_25_2018_12", "MEDITS_CYP_25_2021_5", "MEDITS_CYP_25_2019_27", "MEDITS_CYP_25_2020_24", "MEDITS_CYP_25_2020_27"))) # Set of matrix BENT_env_LEVA <- BENT_extract_LEVA %>% dplyr::select(-area, -year, -x, -y) BENT_met_LEVA <- BENT_extract_LEVA %>% dplyr::select(haul_id, area, year, x, y) BENT_LEVA <- inner_join(BENT_met_LEVA, BENT_mat, by= c("haul_id","area","year","x", "y")) %>% column_to_rownames("haul_id") %>% dplyr::select( -area, -year, -x, -y) %>% mutate_if(is.numeric, function(x) log(x+1)) %>% select_if(~ any(. != 0)) #decostand(method = "hellinger") LEVA_taxa <- colnames(BENT_LEVA) # Save save(BENT_env_LEVA, file = "Data/Community_matrix/BENT_env_LEVA.Rdata") save(BENT_met_LEVA, file = "Data/Community_matrix/BENT_met_LEVA.Rdata") save(BENT_LEVA, file = "Data/Community_matrix/BENT_LEVA.Rdata") ``` # Stack of environmental variable per sub-basin These raster layers will be used to make the prediction of bioregion ## Western basin ```{r} # GSA filter GSA_WEST <- st_read(here::here ("Data/Environment/Country/GSA/GSAs_simplified.shp")) %>% filter(SECT_COD %in% c("GSA01", "GSA02", "GSA03", "GSA04", "GSA05", "GSA06", "GSA07", "GSA08", "GSA09", "GSA10", "GSA111", "GSA112", "GSA12")) # Sub-basin extent ext_WEST <- ext(-6, 19, 34, 45) # Extent restriction of the stack env_stack_WEST <- env_stack %>% terra::mask(GSA_WEST) %>% crop(ext_WEST) # Save writeRaster(env_stack_WEST, filename = here::here("Data/Environment/env_stack_WEST.tif"), overwrite = TRUE) ``` ## Central basin ```{r} # GSA filter GSA_CENT <- st_read(here::here ("Data/Environment/Country/GSA/GSAs_simplified.shp")) %>% filter(SECT_COD %in% c("GSA13", "GSA14", "GSA15", "GSA16","GSA19", "GSA20", "GSA21")) # Sub-basin extent ext_CENT <- ext(10, 22.5, 30, 42) # Extent restriction of the stack env_stack_CENT <- env_stack %>% terra::mask(GSA_CENT) %>% crop(ext_CENT) # Save writeRaster(env_stack_CENT, filename = here::here("Data/Environment/env_stack_CENT.tif"), overwrite = TRUE) ``` ## Adriatic basin ```{r} # GSA filter GSA_ADRI <- st_read(here::here ("Data/Environment/Country/GSA/GSAs_simplified.shp")) %>% filter(SECT_COD %in% c("GSA17", "GSA18")) # Sub-basin extent ext_ADRI <- ext(11, 22, 39, 46.5) # Extent restriction of the stack env_stack_ADRI <- env_stack %>% terra::mask(GSA_ADRI) %>% crop(ext_ADRI) # Save writeRaster(env_stack_ADRI, filename = here::here("Data/Environment/env_stack_ADRI.tif"), overwrite = TRUE) ``` ## AEGEAN basin ```{r} # GSA filter GSA_AEGE <- st_read(here::here ("Data/Environment/Country/GSA/GSAs_simplified.shp")) %>% filter(SECT_COD %in% c("GSA22", "GSA23")) # Sub-basin extent ext_AEGE <- ext(22.5, 29.5, 34, 41.5) # Extent restriction of the stack env_stack_AEGE <- env_stack %>% terra::mask(GSA_AEGE) %>% crop(ext_AEGE) # Save writeRaster(env_stack_AEGE, filename = here::here("Data/Environment/env_stack_AEGE.tif"), overwrite = TRUE) ``` # Maps of the study area MEDITS stations + GSA colored by sub-basins ```{r} # Coastline Med sf_use_s2(FALSE) coastline_med <- st_read("E:/PhD_2022_2023/Project/SDM/R/Data/Country/land-polygons/coastline_med_rap.shp") # Extent study_extent_med <- ext(-6,37,30,46) # Mask bottoms above 1000m below_1000m <- rast(here::here("Data/Environment/Terrain/depth.tif")) %>% terra :: crop(study_extent_med) %>% clamp(upper=-1000, values = FALSE) # Assign GSA to each sub-basin sub_basin <- st_read(here::here ("Data/Environment/Country/GSA/GSAs_simplified.shp")) %>% dplyr::select(SECT_COD, SMU_NAME) %>% filter(!SECT_COD %in% c("GSA24", "GSA25", "GSA26", "GSA27","GSA28","GSA29","GSA30")) %>% mutate(REGION_NAME = case_when( SECT_COD %in% c("GSA01", "GSA02", "GSA03", "GSA04", "GSA05", "GSA06", "GSA07", "GSA08", "GSA09", "GSA10", "GSA111", "GSA112") ~ "Western Mediterranean", SECT_COD %in% c("GSA12", "GSA13", "GSA14", "GSA15", "GSA16","GSA19", "GSA20","GSA21") ~ "Central Mediterranean", SECT_COD %in% c("GSA17", "GSA18") ~ "Adriatic Sea", SECT_COD %in% c("GSA22", "GSA23") ~ "Aegean Sea", SECT_COD %in% c("GSA24", "GSA25", "GSA26", "GSA27") ~ "Levantine Sea")) # Color palette palette <- c("mediumpurple","darkseagreen2", "skyblue1", "indianred1") # MEDITS stations in spatial object hauls <- MED_BENT_filter %>% st_as_sf(coords = c("x","y"), crs = 4326) # Map map_sub_basin <- # Mediterranean coastline tm_shape(coastline_med) + tm_polygons(col="grey92", border.col = "black", lwd = 0.3) + # Sub-basin tm_shape(sub_basin) + tm_polygons("REGION_NAME", palette = palette, legend.show = TRUE, title = "Sub-basin", alpha = 0.8, border.alpha = 0) + # Mask 1000m tm_shape(below_1000m) + tm_raster(style = "cont", palette = "white", title = "",legend.show = FALSE) + # MEDITS stations tm_shape(hauls) + tm_dots(col = "black", size = 0.1) + tm_compass(type="4star", position=c("left", "top"), size = 2, show.labels = 2) + # Additional elements of the map tm_scale_bar(position=c("right", "bottom")) + tm_graticules(labels.inside.frame = F, lines = F, labels.size = 1.2) + tm_layout(frame = T, inner.margins = 0, saturation = 1, legend.outside = FALSE, legend.text.size = 1, legend.title.size = 1.3, legend.position = c("left", "bottom")) map_sub_basin # Save tmap_save(map_sub_basin, "E:/PhD_2022_2023/Project/Bioregionalisation/R/Output/Clust_pred/map_sub_basin.png", width = 12, height = 7) ``` # Venn Diagram To compare the number of common taxa across sub-basins ```{r} png("E:/PhD_2022_2023/Project/Bioregionalisation/R/Output/Clust_pred/venn_diagram.png", width = 800, height = 800) venn.plot <- venn.diagram( x = list(vec1 = WEST_taxa, vec2 = EAST_taxa, vec3 = CENT_taxa, vec4 = ADRI_taxa), category.names = c("Western basin", "Aegean Sea", "Central basin", "Adriatic Sea"), filename = NULL, output = TRUE, fill = c("lightpink", "lightgreen", "lightblue", "lavender"), cat.cex = 2, cex = 2 ) grid.draw(venn.plot) dev.off() ```