############## Bio pRedictions ##############
############## Paweł Bogawski  ##############
############## Part 1          ##############
############## Species distribution modelling
#---------
############### Section 1 - presence data

#library(rJava)
#install.packages("rgdal")
library(rgdal)
library(raster)   # or terra
library(sp)       # or sf
library(dismo)
library(tidyverse)
library(reshape2)
#Sys.getenv("JAVA_HOME")
library(maptools) # for country boundaries
data(wrld_simpl)

# Setting a working directory
choose.dir()      # interactive directory picker (Windows only)
setwd("C:/Users/user/Downloads/biopredictions")

##########
#### 1. Find coordinates of species locations
##########

##### Importing presence points
# First they should be downloaded, for example from the GBIF website: https://www.gbif.org/
#par(mfrow = c(1,1))
ly2 <- read.csv("wombat_lokalizacje.csv", sep=";")  # importing data downloaded from GBIF (or our own dataset)
head(ly2)  # checking if the location data are loaded properly
#ly2 <- ly2[,-c(1,4)]  # removing unneeded columns
dim(ly2)   # checking the size of the location dataset

####
# Alternatively, we can download species records from GBIF using the R interface (function gbif from the dismo package)
ext <- extent(6.5, 10.5, 50.0, 54.5)
g <- gbif('Hordeum', 'vulgare', geo=TRUE, download=TRUE, ext=ext)  # first arg - genus, second arg - species; geo=TRUE means that only records with coordinates are downloaded
head(g)    # checking
g_cleaned <- g %>% dplyr::select(lat, lon)  # keeping only the necessary columns
head(g_cleaned)
plot(g_cleaned$lon, g_cleaned$lat)          # checking the points
ly2 <- g_cleaned

##########
#### 2. Decrease spatial autocorrelation in species locations
##########

# Changing the class from data frame (ly2) to spatial points (ly1)
ly1 <- ly2
coordinates(ly1) <- ~lon+lat                          # selecting the columns holding the locations (longitude and latitude)
projection(ly1) <- CRS('+proj=longlat +datum=WGS84')  # assigning the coordinate reference system
ly2  # data frame
ly1  # spatial points

# create a grid for removing all locations but one per grid cell
r <- raster(ly1)      # from the raster package
# set the resolution of the cells to the same resolution as the environmental data (10 arc-minutes)
res(r) <- 0.1666667   # setting the raster resolution
# expand (extend) the extent of the RasterLayer a little
r <- extend(r, extent(r)+1)  # adding one column and one row of cells to the raster
# sample only one point per grid cell
ly_sel <- gridSample(ly1, r, n=1)
dim(ly_sel)
plot(ly_sel)
ly3 <- as.data.frame(ly_sel)  # saving the results as a data frame
head(ly3)
ly2 <- ly3[sample(nrow(ly3), 100), ]  # limiting the presence records for clarity
plot(ly2$lon, ly2$lat)
ly4 <- ly2
coordinates(ly4) <- ~lon+lat                          # selecting the columns holding the locations (longitude and latitude)
projection(ly4) <- CRS('+proj=longlat +datum=WGS84')  # assigning the coordinate reference system
ly2  # data frame
ly4  # spatial points
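# Optional check (a sketch, not part of the original workflow): spatial thinning should
# increase the typical nearest-neighbour distance between presence points. Comparing the
# minimum pairwise distances before (ly1) and after (ly4) gridSample() makes this visible.
d_before <- sp::spDists(coordinates(ly1), longlat = TRUE)  # pairwise great-circle distances in km
d_after  <- sp::spDists(coordinates(ly4), longlat = TRUE)
diag(d_before) <- NA; diag(d_after) <- NA                  # ignore zero self-distances
mean(apply(d_before, 1, min, na.rm = TRUE))                # mean nearest-neighbour distance before thinning
mean(apply(d_after,  1, min, na.rm = TRUE))                # ...and after thinning (should be larger)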
##########
#### 3. Collect data on the environmental/climatic conditions
##########

# Data downloaded from https://worldclim.org/data/worldclim21.html (Bioclimatic variables, resolution: 10 arc-minutes)
# First the data should be unzipped into the working directory
# Get the file names of the predictor variables
files <- list.files(path='.', pattern='tif$', full.names=TRUE)  # listing all files in the working directory with the specified extension
files
# we use the listed files to create a RasterStack
env <- stack(files)
env  # checking the parameters of the stacked data
plot(env$wc2.1_10m_bio_1)  # display one layer from the set of layers
e <- ext                   # study area rectangle (the extent defined in section 1)
plot(e, add=TRUE)
predictors <- crop(env, e) # clipping the environmental data to the study area for faster and more reliable computation
plot(predictors$wc2.1_10m_bio_12)  # check if the data were clipped correctly

# creating predictors
names(predictors)  # show the current names of the layers
# assign new names to the layers for simplicity
# (note: the files are listed alphabetically, hence the order bio1, bio10, bio11, ..., bio19, bio2, ...)
names(predictors) <- c("bio1", "bio10", "bio11", "bio12", "bio13", "bio14", "bio15", "bio16", "bio17",
                       "bio18", "bio19", "bio2", "bio3", "bio4", "bio5", "bio6", "bio7", "bio8", "bio9")
plot(predictors$bio10)  # checking if this works - display one environmental variable clipped to the study area

##########
#### 4. Identify which environmental variables are correlated with each other and remove one variable from each highly correlated pair
##########

# changing the class of our study area to SpatialPolygons - this is needed to spatially sample points from that area
e_area2 <- as(e, 'SpatialPolygons')  # study area
samp_var <- spsample(e_area2, 1000, type='random', iter=25)  # random background points for studying the relationships between variables

# Plotting
plot(predictors$bio2)  # plotting an example variable: bio2 - mean diurnal temperature range
predictors$bio2        # checking information on the bio2 layer
plot(e_area2, add=T)
plot(samp_var, cex=0.1, add=T)

# extracting values from the raster stack for the correlation analysis
samp_var  # checking the object - randomly sampled points
vals <- na.omit(raster::extract(predictors, samp_var))  # extracting data from the raster dataset
head(vals)  # checking the values
dim(vals)   # checking the table size

# correlation matrix
library(corrplot)  # to visualize correlations
corm <- cor(vals)  # create a correlation matrix
head(corm)         # checking
corrplot(corm, method="number", type="upper", order="alphabet", number.cex = 0.8)  # correlation matrix visualisation
head(vals)
# most evident variables to remove: bio12, bio1, bio10, bio16, bio19, bio5, bio7
library(dplyr)
library(tidyverse)
vals1 <- as.data.frame(vals) %>% dplyr::select(-bio10, -bio13, -bio14, -bio12)
head(vals1)
# next round - looking for other variables that are still correlated with each other
corm1 <- cor(vals1)
corrplot(corm1, method="number", type="upper", order="alphabet", number.cex = 0.8)
vals2 <- vals1 %>% dplyr::select(-bio11, -bio4, -bio17)  # (bio13 was already removed in the previous step)
head(vals2)
# there may still be variables correlated with each other above |0.75|
corm2 <- cor(vals2)
corrplot(corm2, method="number", type="upper", order="alphabet", number.cex = 0.8)
## Now, for every pair of remaining variables, the correlation is below the threshold
# predictors - variant with low correlation: bio2, bio3, bio6, bio8, bio9, bio14, bio15, bio18
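# An automated alternative to the manual pruning above (a sketch; it assumes the caret package
# is installed, which is not used elsewhere in this script): findCorrelation() flags, for each
# pair of variables correlated above the cutoff, the one that should be dropped.
library(caret)
drop_vars <- findCorrelation(cor(vals), cutoff = 0.75, names = TRUE)  # names of the variables to remove
drop_vars
vals_auto <- as.data.frame(vals) %>% dplyr::select(-all_of(drop_vars))  # low-correlation set chosen automatically
head(vals_auto)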
# Limiting the predictor variables to the selected subset (threshold for pairwise correlation: 0.75)
names(predictors)
nam <- c("bio15", "bio18", "bio2", "bio3", "bio8", "bio9", "bio13", "bio1", "bio4", "bio5", "bio7")
predictors
predictors1 <- subset(predictors, subset = nam)  # subsetting the predictors dataset to the variables specified in the vector nam
plot(predictors1)  # checking

##########
#### 5-6. Extract environmental data for species locations and background
##########

# Creating a mask for point sampling - country boundaries
projection(e_area2) <- CRS('+proj=longlat +datum=WGS84')  # the study area needs a CRS before intersecting it with the boundaries

# Option A: if we have spatial data in shapefile format
mask <- readOGR(".", layer="AUS_single")
plot(mask)
mask <- raster::intersect(mask, e_area2)

### Option B: downloading country boundaries if we do not have our own spatial data
library(GADMTools)
aus <- gadm_sp_loadCountries(c("DNK", "DEU", "SWE"), level=0, basefile="./", simplify = 0.025)  # downloading
aus_p <- readRDS("DEU_adm0.rds")                        # loading
projection(aus_p) <- CRS('+proj=longlat +datum=WGS84')  # assigning the projection
ausp <- disaggregate(aus_p)  # splitting the country into separate polygons - mainland and islands (in the wombat example: Australia and the surrounding islands)
head(ausp)                   # checking if the polygons were separated
ausp$area_sqkm <- area(ausp)/1000000       # calculating the area of the polygons in km2
ausp <- ausp[rev(order(ausp$area_sqkm)),]  # sorting from the largest polygon to the smallest
ausp <- ausp[1:2,]                         # keeping only the two largest polygons (in the wombat example: Australia and Tasmania)
plot(ausp)                                 # checking
mask <- raster::intersect(ausp, e_area2)   # clipping to the study area
plot(e_area2, add=TRUE)
plot(ausp, add = TRUE)
e_area2

# Visualize the domain for background points
par(mfrow=c(1,1))
samp_var <- spsample(mask, 1000, type='random', iter=25)  # sampling background points
plot(mask)
points(samp_var, cex=0.5)

# Visualize presence records and background data
plot(mask)
plot(ly4, add = TRUE, cex = 0.5, col = "red")
plot(samp_var, add = TRUE, cex = 0.1)
axis(side = 1)
axis(side = 2)
box()

##### Creating presence and background datasets by extracting environmental data
# background points for training
backg <- randomPoints(predictors1, n=1000, ext=mask, extf = 1.25)  # background points for training (within the study area)
colnames(backg) = c('lon', 'lat')
head(backg)
# background points for testing
#backg_test <- randomPoints(predictors1, n=1000, ext=mask, extf = 1.25)  # background points for testing
#colnames(backg_test) = c('lon', 'lat')

# presence points alone
ly4  # spatial dataset
ly2  # data frame

# presence points with environmental variables for training the bioclim model
presvals1 <- raster::extract(predictors1, ly4)   # extracting environmental data for the presence points
absvals1  <- raster::extract(predictors1, backg) # extracting environmental data for the background points
pb1 <- c(rep(1, nrow(presvals1)), rep(0, nrow(absvals1)))  # indicator variable: 1 for species presence locations, 0 for background points
unique(pb1)
sdmdata1 <- data.frame(cbind(pb1, rbind(presvals1, absvals1)))  # combining the indicator variable with the environmental data
head(sdmdata1)  # checking - the 1st column shows whether the species is present (1) or not (0)
tail(sdmdata1)  # checking
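# Optional sanity check (a sketch, not part of the original workflow): presence or background
# points falling outside the raster coverage (e.g. over water) yield NA predictor values.
# It is worth knowing how many such rows there are before calibrating the models.
sum(!complete.cases(sdmdata1))  # number of rows with missing predictor values
summary(sdmdata1)               # NAs per predictor, plus the value ranges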
##########
#### 7-8. Calibrate the model and evaluate it
##########

## Database for the Bioclim model
pres <- sdmdata1[sdmdata1[,1] == 1, 2:ncol(sdmdata1)]  # data frame with the environmental conditions at the species locations
back <- sdmdata1[sdmdata1[,1] == 0, 2:ncol(sdmdata1)]  # data frame with the environmental conditions at the background points
head(back)  # without the indicator variable
head(pres)  # without the indicator variable

## Model evaluation - 10-fold cross-validation
group <- kfold(ly2, 10)  # creating folds (every record is assigned to one of the 10 groups)
group
k = 10        # number of folds
ob <- list()  # empty list for storing the results of the validation

# Validation
for (i in 1:k) {
  train <- pres[group != i,]               # all data except the group marked with i, which is left out for validation
  test  <- pres[group == i,]               # the one group used for validation
  bc <- bioclim(train)                     # training the model
  ob[[i]] <- evaluate(p=test, a=back, bc)  # evaluating this particular fold and storing the result in the list
}
ob[[1]]  # checking the first fold

# Calculation of the metric used in the evaluation of the model
# The metric is the Area Under the ROC (Receiver Operating Characteristic) Curve (AUC).
# The ROC curve describes the relationship between the true positive rate (proportion of observed
# presences that are correctly predicted) and the false positive rate (proportion of observed
# absences that are incorrectly predicted). Interpretation: the higher the AUC, the better the model.
aucb <- sapply(ob, function(x){x@auc})  # extracting the AUC from the evaluation results
aucb

# Calculating thresholds - MAX sensitivity + specificity,
# i.e. the threshold at which the sum of the sensitivity (true positive rate)
# and specificity (true negative rate) is highest.
# It is possible to select other thresholds as well, see the dismo package manual.
tr <- sapply(ob, function(x) threshold(x, 'spec_sens'))  # extracting the threshold
tr
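# A quick way to inspect the cross-validation results (a sketch, not part of the original script):
# dismo provides a plot method for its ModelEvaluation objects, so the ROC curve of a single fold
# can be drawn directly, and the AUC can be summarised across all folds.
plot(ob[[1]], 'ROC')  # ROC curve of the first fold
mean(aucb)            # mean cross-validated AUC
sd(aucb)              # spread of the AUC across the folds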
##########
#### 9. Using the model to predict the current species distribution
##########

pb <- predict(predictors1, bc, ext=mask, progress='')  # creating the prediction map
pb
par(mfrow=c(1,2))  # divide the plot window into two columns
plot(pb, main='Bioclim, raw values')  # raw prediction values: 0 means unsuitable conditions, 1 means perfect conditions for the species
plot(wrld_simpl, add=TRUE, border='dark grey')  # adding country boundaries
plot(pb > mean(tr), main='presence/absence')  # binarizing the predictions based on the threshold - if the predicted value is higher than the threshold, the pixel is considered potential habitat
plot(wrld_simpl, add=TRUE, border='dark grey')  # adding country boundaries to the second map
points(ly4, pch='+')  # adding the point locations

## ---- Calibration and evaluation of the Random Forest (RF) model -------------------------
### A slightly different data frame is needed to properly calibrate and evaluate a random forest model
dim(ly2)
head(ly2)
names(ly2) <- c("lon", "lat")  # assigning column names
head(backg)
points(ly2, col = "red")       # adding the presence points to the previous plot
train <- rbind(ly2, backg)     # combining the species presence locations and the background point locations
tail(train)  # checking
dim(train)
pb_train <- c(rep(1, nrow(ly2)), rep(0, nrow(backg)))  # indicator variable: 1 for presences, 0 for background
pb_train
head(pb_train)  # checking
plot(train$lon, train$lat)
coordinates(train) <- ~lon+lat                          # making spatial points
projection(train) <- CRS('+proj=longlat +datum=WGS84')  # adding the projection
envtrain <- raster::extract(predictors1, train)         # extracting environmental data for all training points
head(envtrain)
envtrain <- data.frame(cbind(pa=pb_train, envtrain))    # combining the indicator variable with the extracted environmental data
head(envtrain)
length(envtrain$pa[envtrain$pa==1])  # checking the number of presences
dim(envtrain)

# Training background data
backg_envtrain <- raster::extract(predictors1, backg)  # background dataset used for testing
head(backg_envtrain)

library(randomForest)
envtrain1 <- na.omit(envtrain)  # removing NA pixels (e.g. points extracted over the ocean)
model <- pa ~ .                 # model formula
rf1 <- randomForest(model, data=envtrain1, importance=TRUE)  # fitting the model

# Model evaluation - 10-fold cross-validation
k = 10
group1 <- kfold(envtrain1, 10, by=envtrain1$pa)  # folds stratified by presence/background
group1
rr <- list()
#head(envtrain)
for (i in 1:k) {
  train <- envtrain1[group1 != i,]
  test  <- envtrain1[group1 == i,]
  rf1 <- randomForest(pa~., data=train, importance=TRUE)
  rr[[i]] <- evaluate(p=test[test$pa==1,], a=backg_envtrain, rf1)
}
# The rest is very similar to the bioclim evaluation, using the same measure (AUC) and the same threshold (spec_sens)
aucb_rf <- sapply(rr, function(x){x@auc})
aucb_rf
tr_rf <- sapply(rr, function(x) threshold(x, 'spec_sens'))
tr_rf
rf2 <- randomForest(model, data=envtrain1, importance=TRUE)
pb_rf <- predict(predictors1, rf2, ext=mask, progress='')
pb_rf

# Measuring the importance of each variable
impo <- as.data.frame(rf2$importance)
arrange(impo, desc(`%IncMSE`))  # sorting the variables from the most to the least important
impo$`%IncMSE`

# Current prediction of the potential range
par(mfrow=c(1,2))
plot(pb_rf, main='Random Forest, raw values')
plot(wrld_simpl, add=TRUE, border='dark grey')
plot(pb_rf > mean(tr_rf), main='presence/absence')
plot(wrld_simpl, add=TRUE, border='dark grey')
points(ly4, pch='+')
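# The importance table computed above can also be inspected graphically (a sketch; varImpPlot()
# is a standard helper from the randomForest package, not used in the original script).
par(mfrow=c(1,1))
varImpPlot(rf2, main = "Random Forest - variable importance")  # %IncMSE and IncNodePurity for each predictor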
#######
### 10. Model selection
#######

# Comparison of model quality - bioclim vs. random forest, based on AUC
par(mfrow = c(1,1))
boxplot(aucb, aucb_rf)  # distributions of the cross-validated AUC values for the two models
shapiro.test(aucb)      # checking whether the AUC values are normally distributed
# Checking the significance of the differences
# (a non-parametric alternative for non-normal AUC values is sketched at the very end of the script)
t.test(aucb, aucb_rf, paired = FALSE)

#
## ---- Appendix: obtaining country/continent boundaries ------------------------------------
# Option A: the rnaturalearth package
library(rnaturalearth)
library(rnaturalearthdata)
?ne_countries
spdf_africa <- ne_countries(continent = 'africa')  # boundaries of all African countries
class(spdf_africa)
library(sp)
library(raster)
afr <- ne_download()  # downloading Natural Earth data directly (countries by default)

# Option B: the rworldmap package
library(rworldmap)
sPDF <- getMap()           # world map as a SpatialPolygonsDataFrame
saveRDS(sPDF, "sPDF.rds")  # has to be saved first (without saving this does not work)
sPDF <- readRDS("sPDF.rds")
head(sPDF)
sPDF$continent             # the attribute used for subsetting
afr <- sPDF[sPDF@data$continent == "Africa", ]    # subsetting a single continent
plot(afr)
class(afr)
selected <- "Europe"
eur <- sPDF[sPDF@data$continent %in% selected, ]  # the same with %in% - convenient for selecting several continents at once
plot(eur)
# mapping the countries coloured by the 'continent' attribute
map <- mapCountryData(sPDF, nameColumnToPlot='continent')
class(map)
map$colourVector
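# Relating to the model comparison in section 10 above (a sketch, not part of the original script):
# if the Shapiro-Wilk tests suggest that the AUC values are not normally distributed,
# a non-parametric test may be more appropriate than the t-test.
shapiro.test(aucb_rf)                       # also check the random forest AUC values
wilcox.test(aucb, aucb_rf, paired = FALSE)  # Mann-Whitney U test as an alternative to the t-test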