############## Bio pRedictions ##############
############## Paweł Bogawski  ##############
############## Part 1          ##############
############## Species distribution modelling
#---------
############### Section 1 - presence data

#library(rJava)
#install.packages("rgdal")
library(rgdal)
library(raster)   # or terra
library(sp)       # or sf
library(dismo)
library(tidyverse)
library(reshape2)
#Sys.getenv("JAVA_HOME")
library(maptools) # for country boundaries
data(wrld_simpl)

# Setting a working directory
choose.dir()      # interactive directory picker (Windows only)
setwd("C:/Users/user/Downloads/biopredictions")

##########
#### 1. Find coordinates of species locations
##########

##### Importing presence points
# First they should be downloaded, for example from the GBIF website: https://www.gbif.org/
#par(mfrow = c(1,1))
ly2 <- read.csv("wombat_lokalizacje.csv", sep=";")  # importing data downloaded from GBIF (or our own dataset)
head(ly2)  # checking if the location data are loaded properly
#ly2 <- ly2[,-c(1,4)]  # removing unneeded columns
dim(ly2)   # checking the size of the location dataset

####
# Alternatively, we can download species records from GBIF using the R interface (function gbif from the dismo package)
ext <- extent(6.5, 10.5, 50.0, 54.5)
g <- gbif('Hordeum', 'vulgare', geo=TRUE, download=TRUE, ext=ext)  # first arg - genus, second arg - species; geo=TRUE means that only records with coordinates are downloaded
head(g)    # checking
g_cleaned <- g %>% dplyr::select(lat, lon)  # keeping only the necessary columns
head(g_cleaned)
plot(g_cleaned$lon, g_cleaned$lat)          # checking the points
ly2 <- g_cleaned

##########
#### 2. Decrease spatial autocorrelation in species locations
##########

# Changing the class from data frame (ly2) to spatial points (ly1)
ly1 <- ly2
coordinates(ly1) <- ~lon+lat                          # selecting the columns holding the locations (longitude and latitude)
projection(ly1) <- CRS('+proj=longlat +datum=WGS84')  # assigning the coordinate reference system
ly2  # data frame
ly1  # spatial points

# create a grid for removing all locations but one per grid cell
r <- raster(ly1)      # from the raster package
# set the resolution of the cells to the same resolution as the environmental data (10 arc-minutes)
res(r) <- 0.1666667   # setting the raster resolution
# expand (extend) the extent of the RasterLayer a little
r <- extend(r, extent(r)+1)  # adding one column and one row of cells to the raster
# sample only one point per grid cell
ly_sel <- gridSample(ly1, r, n=1)
dim(ly_sel)
plot(ly_sel)
ly3 <- as.data.frame(ly_sel)  # saving the results as a data frame
head(ly3)
ly2 <- ly3[sample(nrow(ly3), 100), ]  # limiting the presence records for clarity
plot(ly2$lon, ly2$lat)
ly4 <- ly2
coordinates(ly4) <- ~lon+lat                          # selecting the columns holding the locations (longitude and latitude)
projection(ly4) <- CRS('+proj=longlat +datum=WGS84')  # assigning the coordinate reference system
ly2  # data frame
ly4  # spatial points
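# Optional check (a sketch, not part of the original workflow): spatial thinning should
# increase the typical nearest-neighbour distance between presence points. Comparing the
# minimum pairwise distances before (ly1) and after (ly4) gridSample() makes this visible.
d_before <- sp::spDists(coordinates(ly1), longlat = TRUE)  # pairwise great-circle distances in km
d_after  <- sp::spDists(coordinates(ly4), longlat = TRUE)
diag(d_before) <- NA; diag(d_after) <- NA                  # ignore zero self-distances
mean(apply(d_before, 1, min, na.rm = TRUE))                # mean nearest-neighbour distance before thinning
mean(apply(d_after,  1, min, na.rm = TRUE))                # ...and after thinning (should be larger)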
##########
#### 3. Collect data on the environmental/climatic conditions
##########

# Data downloaded from https://worldclim.org/data/worldclim21.html (Bioclimatic variables, resolution: 10 arc-minutes)
# First the data should be unzipped into the working directory
# Get the file names of the predictor variables
files <- list.files(path='.', pattern='tif$', full.names=TRUE)  # listing all files in the working directory with the specified extension
files
# we use the listed files to create a RasterStack
env <- stack(files)
env  # checking the parameters of the stacked data
plot(env$wc2.1_10m_bio_1)  # display one layer from the set of layers
e <- ext                   # study area rectangle (the extent defined in section 1)
plot(e, add=TRUE)
predictors <- crop(env, e) # clipping the environmental data to the study area for faster and more reliable computation
plot(predictors$wc2.1_10m_bio_12)  # check if the data were clipped correctly

# creating predictors
names(predictors)  # show the current names of the layers
# assign new names to the layers for simplicity
# (note: the files are listed alphabetically, hence the order bio1, bio10, bio11, ..., bio19, bio2, ...)
names(predictors) <- c("bio1", "bio10", "bio11", "bio12", "bio13", "bio14", "bio15", "bio16", "bio17",
                       "bio18", "bio19", "bio2", "bio3", "bio4", "bio5", "bio6", "bio7", "bio8", "bio9")
plot(predictors$bio10)  # checking if this works - display one environmental variable clipped to the study area

##########
#### 4. Identify which environmental variables are correlated with each other and remove one variable from each highly correlated pair
##########

# changing the class of our study area to SpatialPolygons - this is needed to spatially sample points from that area
e_area2 <- as(e, 'SpatialPolygons')  # study area
samp_var <- spsample(e_area2, 1000, type='random', iter=25)  # random background points for studying the relationships between variables

# Plotting
plot(predictors$bio2)  # plotting an example variable: bio2 - mean diurnal temperature range
predictors$bio2        # checking information on the bio2 layer
plot(e_area2, add=T)
plot(samp_var, cex=0.1, add=T)

# extracting values from the raster stack for the correlation analysis
samp_var  # checking the object - randomly sampled points
vals <- na.omit(raster::extract(predictors, samp_var))  # extracting data from the raster dataset
head(vals)  # checking the values
dim(vals)   # checking the table size

# correlation matrix
library(corrplot)  # to visualize correlations
corm <- cor(vals)  # create a correlation matrix
head(corm)         # checking
corrplot(corm, method="number", type="upper", order="alphabet", number.cex = 0.8)  # correlation matrix visualisation
head(vals)
# most evident variables to remove: bio12, bio1, bio10, bio16, bio19, bio5, bio7
library(dplyr)
library(tidyverse)
vals1 <- as.data.frame(vals) %>% dplyr::select(-bio10, -bio13, -bio14, -bio12)
head(vals1)
# next round - looking for other variables that are still correlated with each other
corm1 <- cor(vals1)
corrplot(corm1, method="number", type="upper", order="alphabet", number.cex = 0.8)
vals2 <- vals1 %>% dplyr::select(-bio11, -bio4, -bio17)  # (bio13 was already removed in the previous step)
head(vals2)
# there may still be variables correlated with each other above |0.75|
corm2 <- cor(vals2)
corrplot(corm2, method="number", type="upper", order="alphabet", number.cex = 0.8)
## Now, for every pair of remaining variables, the correlation is below the threshold
# predictors - variant with low correlation: bio2, bio3, bio6, bio8, bio9, bio14, bio15, bio18
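# An automated alternative to the manual pruning above (a sketch; it assumes the caret package
# is installed, which is not used elsewhere in this script): findCorrelation() flags, for each
# pair of variables correlated above the cutoff, the one that should be dropped.
library(caret)
drop_vars <- findCorrelation(cor(vals), cutoff = 0.75, names = TRUE)  # names of the variables to remove
drop_vars
vals_auto <- as.data.frame(vals) %>% dplyr::select(-all_of(drop_vars))  # low-correlation set chosen automatically
head(vals_auto)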
# Limiting the predictor variables to the selected subset (threshold for pairwise correlation: 0.75)
names(predictors)
nam <- c("bio15", "bio18", "bio2", "bio3", "bio8", "bio9", "bio13", "bio1", "bio4", "bio5", "bio7")
predictors
predictors1 <- subset(predictors, subset = nam)  # subsetting the predictors dataset to the variables specified in the vector nam
plot(predictors1)  # checking

##########
#### 5-6. Extract environmental data for species locations and background
##########

# Creating a mask for point sampling - country boundaries
projection(e_area2) <- CRS('+proj=longlat +datum=WGS84')  # the study area needs a CRS before intersecting it with the boundaries

# Option A: if we have spatial data in shapefile format
mask <- readOGR(".", layer="AUS_single")
plot(mask)
mask <- raster::intersect(mask, e_area2)

### Option B: downloading country boundaries if we do not have our own spatial data
library(GADMTools)
aus <- gadm_sp_loadCountries(c("DNK", "DEU", "SWE"), level=0, basefile="./", simplify = 0.025)  # downloading
aus_p <- readRDS("DEU_adm0.rds")                        # loading
projection(aus_p) <- CRS('+proj=longlat +datum=WGS84')  # assigning the projection
ausp <- disaggregate(aus_p)  # splitting the country into separate polygons - mainland and islands (in the wombat example: Australia and the surrounding islands)
head(ausp)                   # checking if the polygons were separated
ausp$area_sqkm <- area(ausp)/1000000       # calculating the area of the polygons in km2
ausp <- ausp[rev(order(ausp$area_sqkm)),]  # sorting from the largest polygon to the smallest
ausp <- ausp[1:2,]                         # keeping only the two largest polygons (in the wombat example: Australia and Tasmania)
plot(ausp)                                 # checking
mask <- raster::intersect(ausp, e_area2)   # clipping to the study area
plot(e_area2, add=TRUE)
plot(ausp, add = TRUE)
e_area2

# Visualize the domain for background points
par(mfrow=c(1,1))
samp_var <- spsample(mask, 1000, type='random', iter=25)  # sampling background points
plot(mask)
points(samp_var, cex=0.5)

# Visualize presence records and background data
plot(mask)
plot(ly4, add = TRUE, cex = 0.5, col = "red")
plot(samp_var, add = TRUE, cex = 0.1)
axis(side = 1)
axis(side = 2)
box()

##### Creating presence and background datasets by extracting environmental data
# background points for training
backg <- randomPoints(predictors1, n=1000, ext=mask, extf = 1.25)  # background points for training (within the study area)
colnames(backg) = c('lon', 'lat')
head(backg)
# background points for testing
#backg_test <- randomPoints(predictors1, n=1000, ext=mask, extf = 1.25)  # background points for testing
#colnames(backg_test) = c('lon', 'lat')

# presence points alone
ly4  # spatial dataset
ly2  # data frame

# presence points with environmental variables for training the bioclim model
presvals1 <- raster::extract(predictors1, ly4)   # extracting environmental data for the presence points
absvals1  <- raster::extract(predictors1, backg) # extracting environmental data for the background points
pb1 <- c(rep(1, nrow(presvals1)), rep(0, nrow(absvals1)))  # indicator variable: 1 for species presence locations, 0 for background points
unique(pb1)
sdmdata1 <- data.frame(cbind(pb1, rbind(presvals1, absvals1)))  # combining the indicator variable with the environmental data
head(sdmdata1)  # checking - the 1st column shows whether the species is present (1) or not (0)
tail(sdmdata1)  # checking
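# Optional sanity check (a sketch, not part of the original workflow): presence or background
# points falling outside the raster coverage (e.g. over water) yield NA predictor values.
# It is worth knowing how many such rows there are before calibrating the models.
sum(!complete.cases(sdmdata1))  # number of rows with missing predictor values
summary(sdmdata1)               # NAs per predictor, plus the value ranges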
##########
#### 7-8. Calibrate the model and evaluate it
##########

## Database for the Bioclim model
pres <- sdmdata1[sdmdata1[,1] == 1, 2:ncol(sdmdata1)]  # data frame with the environmental conditions at the species locations
back <- sdmdata1[sdmdata1[,1] == 0, 2:ncol(sdmdata1)]  # data frame with the environmental conditions at the background points
head(back)  # without the indicator variable
head(pres)  # without the indicator variable

## Model evaluation - 10-fold cross-validation
group <- kfold(ly2, 10)  # creating folds (every record is assigned to one of the 10 groups)
group
k = 10        # number of folds
ob <- list()  # empty list for storing the results of the validation

# Validation
for (i in 1:k) {
  train <- pres[group != i,]               # all data except the group marked with i, which is left out for validation
  test  <- pres[group == i,]               # the one group used for validation
  bc <- bioclim(train)                     # training the model
  ob[[i]] <- evaluate(p=test, a=back, bc)  # evaluating this particular fold and storing the result in the list
}
ob[[1]]  # checking the first fold

# Calculation of the metric used in the evaluation of the model
# The metric is the Area Under the ROC (Receiver Operating Characteristic) Curve (AUC).
# The ROC curve describes the relationship between the true positive rate (proportion of observed
# presences that are correctly predicted) and the false positive rate (proportion of observed
# absences that are incorrectly predicted). Interpretation: the higher the AUC, the better the model.
aucb <- sapply(ob, function(x){x@auc})  # extracting the AUC from the evaluation results
aucb

# Calculating thresholds - MAX sensitivity + specificity,
# i.e. the threshold at which the sum of the sensitivity (true positive rate)
# and specificity (true negative rate) is highest.
# It is possible to select other thresholds as well, see the dismo package manual.
tr <- sapply(ob, function(x) threshold(x, 'spec_sens'))  # extracting the threshold
tr
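# A quick way to inspect the cross-validation results (a sketch, not part of the original script):
# dismo provides a plot method for its ModelEvaluation objects, so the ROC curve of a single fold
# can be drawn directly, and the AUC can be summarised across all folds.
plot(ob[[1]], 'ROC')  # ROC curve of the first fold
mean(aucb)            # mean cross-validated AUC
sd(aucb)              # spread of the AUC across the folds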
##########
#### 9. Using the model to predict the current species distribution
##########

pb <- predict(predictors1, bc, ext=mask, progress='')  # creating the prediction map
pb
par(mfrow=c(1,2))  # divide the plot window into two columns
plot(pb, main='Bioclim, raw values')  # raw prediction values: 0 means unsuitable conditions, 1 means perfect conditions for the species
plot(wrld_simpl, add=TRUE, border='dark grey')  # adding country boundaries
plot(pb > mean(tr), main='presence/absence')  # binarizing the predictions based on the threshold - if the predicted value is higher than the threshold, the pixel is considered potential habitat
plot(wrld_simpl, add=TRUE, border='dark grey')  # adding country boundaries to the second map
points(ly4, pch='+')  # adding the point locations

## ---- Calibration and evaluation of the Random Forest (RF) model -------------------------
### A slightly different data frame is needed to properly calibrate and evaluate a random forest model
dim(ly2)
head(ly2)
names(ly2) <- c("lon", "lat")  # assigning column names
head(backg)
points(ly2, col = "red")       # adding the presence points to the previous plot
train <- rbind(ly2, backg)     # combining the species presence locations and the background point locations
tail(train)  # checking
dim(train)
pb_train <- c(rep(1, nrow(ly2)), rep(0, nrow(backg)))  # indicator variable: 1 for presences, 0 for background
pb_train
head(pb_train)  # checking
plot(train$lon, train$lat)
coordinates(train) <- ~lon+lat                          # making spatial points
projection(train) <- CRS('+proj=longlat +datum=WGS84')  # adding the projection
envtrain <- raster::extract(predictors1, train)         # extracting environmental data for all training points
head(envtrain)
envtrain <- data.frame(cbind(pa=pb_train, envtrain))    # combining the indicator variable with the extracted environmental data
head(envtrain)
length(envtrain$pa[envtrain$pa==1])  # checking the number of presences
dim(envtrain)

# Training background data
backg_envtrain <- raster::extract(predictors1, backg)  # background dataset used for testing
head(backg_envtrain)

library(randomForest)
envtrain1 <- na.omit(envtrain)  # removing NA pixels (e.g. points extracted over the ocean)
model <- pa ~ .                 # model formula
rf1 <- randomForest(model, data=envtrain1, importance=TRUE)  # fitting the model

# Model evaluation - 10-fold cross-validation
k = 10
group1 <- kfold(envtrain1, 10, by=envtrain1$pa)  # folds stratified by presence/background
group1
rr <- list()
#head(envtrain)
for (i in 1:k) {
  train <- envtrain1[group1 != i,]
  test  <- envtrain1[group1 == i,]
  rf1 <- randomForest(pa~., data=train, importance=TRUE)
  rr[[i]] <- evaluate(p=test[test$pa==1,], a=backg_envtrain, rf1)
}
# The rest is very similar to the bioclim evaluation, using the same measure (AUC) and the same threshold (spec_sens)
aucb_rf <- sapply(rr, function(x){x@auc})
aucb_rf
tr_rf <- sapply(rr, function(x) threshold(x, 'spec_sens'))
tr_rf
rf2 <- randomForest(model, data=envtrain1, importance=TRUE)
pb_rf <- predict(predictors1, rf2, ext=mask, progress='')
pb_rf

# Measuring the importance of each variable
impo <- as.data.frame(rf2$importance)
arrange(impo, desc(`%IncMSE`))  # sorting the variables from the most to the least important
impo$`%IncMSE`

# Current prediction of the potential range
par(mfrow=c(1,2))
plot(pb_rf, main='Random Forest, raw values')
plot(wrld_simpl, add=TRUE, border='dark grey')
plot(pb_rf > mean(tr_rf), main='presence/absence')
plot(wrld_simpl, add=TRUE, border='dark grey')
points(ly4, pch='+')
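# The importance table computed above can also be inspected graphically (a sketch; varImpPlot()
# is a standard helper from the randomForest package, not used in the original script).
par(mfrow=c(1,1))
varImpPlot(rf2, main = "Random Forest - variable importance")  # %IncMSE and IncNodePurity for each predictor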
#######
### 10. Model selection
#######

# Comparison of model quality - bioclim vs. random forest, based on AUC
par(mfrow = c(1,1))
boxplot(aucb, aucb_rf)  # distributions of the cross-validated AUC values for the two models
shapiro.test(aucb)      # checking whether the AUC values are normally distributed
# Checking the significance of the differences
# (a non-parametric alternative for non-normal AUC values is sketched at the very end of the script)
t.test(aucb, aucb_rf, paired = FALSE)

#
## ---- Appendix: obtaining country/continent boundaries ------------------------------------
# Option A: the rnaturalearth package
library(rnaturalearth)
library(rnaturalearthdata)
?ne_countries
spdf_africa <- ne_countries(continent = 'africa')  # boundaries of all African countries
class(spdf_africa)
library(sp)
library(raster)
afr <- ne_download()  # downloading Natural Earth data directly (countries by default)

# Option B: the rworldmap package
library(rworldmap)
sPDF <- getMap()           # world map as a SpatialPolygonsDataFrame
saveRDS(sPDF, "sPDF.rds")  # has to be saved first (without saving this does not work)
sPDF <- readRDS("sPDF.rds")
head(sPDF)
sPDF$continent             # the attribute used for subsetting
afr <- sPDF[sPDF@data$continent == "Africa", ]    # subsetting a single continent
plot(afr)
class(afr)
selected <- "Europe"
eur <- sPDF[sPDF@data$continent %in% selected, ]  # the same with %in% - convenient for selecting several continents at once
plot(eur)
# mapping the countries coloured by the 'continent' attribute
map <- mapCountryData(sPDF, nameColumnToPlot='continent')
class(map)
map$colourVector
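# Relating to the model comparison in section 10 above (a sketch, not part of the original script):
# if the Shapiro-Wilk tests suggest that the AUC values are not normally distributed,
# a non-parametric test may be more appropriate than the t-test.
shapiro.test(aucb_rf)                       # also check the random forest AUC values
wilcox.test(aucb, aucb_rf, paired = FALSE)  # Mann-Whitney U test as an alternative to the t-test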