
#DATAFRAME DEVELOPMENT####

#Data development for analysis of telecoupling, ES specialisation, and comparative advantage.
#Data on ES production and flows from Gera, Gumay & Setema woreda, Jimma zone, Ethiopia.
#Here: calculate telecoupling score, ES specialisation scores, and comparative advantage score for each kebele.
#These will be used in further analysis. 

#Setup####

#libraries

library(tidyverse) # general programming
library(readxl)    # import of excel workbook sheets
library(diverse) # compute diversity measures. ('hhi','gs','s', 'e', 'bp') 
library(ineq) #computes the inequality within a vector according to the specified inequality measure 


#data locations

wd <- "" #INSERT DATA LOCATION
xldata_loc <- "Data_ES_Production_Flows.xlsx"

setwd("") #INSERT DATA LOCATION

sheetName_prod <- "production"
sheetName_kebele <- "kebele_data"
sheetName_beans <- "beans"

product_levels_prod <- c("cattle", "beef", "coffee", "euca", "honey", "khat", "maize", "sorghum", "teff")
product_levels_bean <- c("cattle", "beef", "coffee", "euca", "firewood", "honey", "khat", "maize", "sorghum", "teff")

scale_levels <- c("beans_hh", "beans_local", "beans_district", "beans_central", "beans_global")
scale_labels <- c("HH", "local", "district", "central", "global")

recode_scale_0.4 <- c("HH" = 0, "local" = 1, "district" = 2, "central" = 3, "global" = 4)

excluded_kebeles <- c("Chira Town", "Gatira Town", "Toba Town", "Gere Ifalo", "Gemina Dacho")
excluded_products <- c("khat", "firewood")




#Data import and cleaning####

#Khat is removed because it is an illegal product and only little data available
#Firewood is removed because it was difficult to estimate volumes collected

#Read the three sheets from the excel workbook (paths set in the Setup section)
prod <- read_excel(paste0(wd, xldata_loc), sheet=sheetName_prod)     #ES production per kebele
kebele <- read_excel(paste0(wd, xldata_loc), sheet=sheetName_kebele) #kebele-level covariates (area, population, LULC, distances)
bean <- read_excel(paste0(wd, xldata_loc), sheet=sheetName_beans)    #bean exercise: ES flow counts per spatial scale

#Production of ES per kebele (production)
#Keep the survey columns, drop rows without a kebele, code woreda/product
#as factors, and remove the excluded products and kebeles.
prod2 <- prod %>%
  dplyr::select(woreda:area_prod) %>%
  dplyr::filter(kebele != "NaN") %>%
  dplyr::mutate(
    woreda  = factor(woreda),
    product = factor(product, labels = product_levels_prod)
  ) %>%
  dplyr::filter(
    !product %in% excluded_products,
    !kebele %in% excluded_kebeles
  )

#Bean exercise data, showing spatially disaggregated ES flows to five spatial scales, collected for 12 kebeles
#Bean exercise data: one row per kebele x product x scale, with integer counts.
#The "average household" summary row is dropped; excluded products are removed.
bean2 <- bean %>%
  dplyr::select(woreda:beans_global) %>%
  dplyr::filter(kebele != "average household") %>%
  pivot_longer(beans_hh:beans_global, names_to = "scale", values_to = "count") %>%
  dplyr::mutate(
    woreda  = factor(woreda),
    product = factor(product, labels = product_levels_bean),
    scale   = factor(scale, levels = scale_levels, labels = scale_labels),
    count   = as.integer(count)
  ) %>%
  dplyr::filter(!product %in% excluded_products)


#Data development####

#Telecoupling score for each kebele (tele_score)####


#We generate a per-ecosystem service (ES, product) telecoupling score.
#We summarise the mean bean counts by scale and ES (i.e. mean across 12 kebeles),
#and multiply this by a scale "score" (coded by recode_scale_0.4).
#Then we take the sum of these per ES ( =countscore).

#Per-ES telecoupling score: mean bean count per scale (across kebeles),
#weighted by the scale score (0 = HH ... 4 = global), summed over scales.
teleconnect <- bean2 %>%
  group_by(product, scale) %>%
  summarise(count = mean(count, na.rm = TRUE)) %>%
  mutate(countscore = count * recode(scale, !!!recode_scale_0.4)) %>%
  summarise(countscore = sum(countscore, na.rm = TRUE))


#We adjust ES production in each kebele for total kebele area (to account for differences in kebele area).
#We robust scale ES production data for each ES (to make them comparable across ES).
#We add a constant so that all values are larger than 0 (or 1).

#Robust scaling: uses the median and interquartile range instead of the mean and sd
#when centring and scaling a vector, to remove the influence of the outliers on the scaling process

#Robust scaling: centre on the median and scale by the interquartile range,
#so outliers do not dominate the scaling (unlike mean/sd standardisation).
#x:     numeric vector to scale
#na.rm: drop NAs when computing median/quantiles; default FALSE preserves the
#       original behaviour (NA input errors in quantile())
#returns a numeric vector of the same length as x
#NOTE(review): a constant vector has IQR 0 and yields Inf/NaN — unchanged from the original
robust_scalar <- function(x, na.rm = FALSE){
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  (x - median(x, na.rm = na.rm)) / (q[2] - q[1])
}


#Adjust ES production for total kebele area, then robust-scale within each ES.
production_adj_w0 <- prod2 %>% 
  select(woreda, kebele, product, production) %>% 
  left_join(kebele %>% select(kebele, area_total), by = "kebele") %>% #explicit key silences join messages and guards against accidental extra key columns
  mutate(production_adj = production/area_total) %>% 
  select(-production, -area_total) %>% 
  group_split(product) %>% #scale per ES so products are comparable
  purrr::map_dfr(~ .x %>% mutate(production_adj_rs = robust_scalar(production_adj))) %>% #robust scaled production
  mutate(production_adj_rsm = production_adj_rs - min(production_adj_rs), #shift so all values >= 0 (note: min is taken over all ES jointly)
         production_adj_rst = production_adj_rsm + 1) #all values >= 1

production_adj <- production_adj_w0


#We multiply the production of each ES in each kebele
#(production_adj_rsm = production adj by total kebele area, robust scaled, >0)
#with the per-ES telecoupling score created above
#and sum across ES for each kebele.

#Kebele telecoupling score: area-adjusted, robust-scaled production of each ES
#weighted by the per-ES telecoupling score, summed over ES within each kebele.
tele <- production_adj %>% 
  left_join(teleconnect, by = "product") %>% #explicit key silences join messages
  mutate(tele_score = production_adj_rsm*countscore) %>% 
  group_by(woreda, kebele) %>% 
  summarise(tele_score = sum(tele_score, na.rm = TRUE))



#ES specialization for each kebele (spec)####

#Here we calculate Simpson's index based on ES production in each kebele
#to measure how specialized kebeles are in the ES they produce.
#(using production_adj_rsm = total kebele area adjusted, robust scaled, and shifted so that all values >= 0).

#gini-simpson index (Simpson's index, infinite version)
#Gini-Simpson concentration of ES production per kebele:
#higher value = production concentrated in fewer ES = more specialised.
specialisation <- production_adj %>%
  select(kebele, product, production_adj_rsm) %>%
  mutate(kebele = factor(kebele), product = factor(product)) %>%
  as.data.frame() %>%
  diverse::diversity(type = "gs") %>%
  as_tibble(rownames = "kebele") %>%
  select(kebele, spec_gini_simpson = gini.simpson.C)




#Comparative advantage for each kebele (comp_adv)####

#Here we calculate Simpson's index based on ES productivity in each kebele
#to measure how specialized kebeles are in terms of their ES productivities.
#We calculate productivity values based on ES production and area used for ES production.
#We robust scale the ES productivity data for each ES (to make them comparable across ES).
#We add a constant so that all values are larger than 0 (or 1).
#For robustness checks, we calculate additional diversity indices.


#ES productivity per kebele (production per area used for that ES;
#NA productivities set to 0), robust-scaled within each ES and shifted positive.
productivity_w0 <- prod2 %>%
  mutate(productivity = replace_na(production / area_prod, 0)) %>%
  select(-production, -area_prod) %>%
  group_split(product) %>%
  purrr::map_dfr(~ .x %>% mutate(productivity_rs = robust_scalar(productivity))) %>%
  mutate(productivity_rsm = productivity_rs - min(productivity_rs),
         productivity_rst = productivity_rsm + 1)

productivity <- productivity_w0

#gini-simpson index (Simpson's index, infinite version)
#Gini-Simpson concentration of ES productivities per kebele:
#higher value = productivities concentrated in fewer ES = stronger comparative advantage.
comp_adv <- productivity %>%
  select(kebele, product, productivity_rsm) %>%
  mutate(kebele = factor(kebele), product = factor(product)) %>%
  as.data.frame() %>%
  diverse::diversity(type = "gs") %>%
  as_tibble(rownames = "kebele") %>%
  select(kebele, comp_adv_gini_simpson = gini.simpson.C)




#Additional variables####


#Forest share/pop density/female pop

#Derived kebele covariates
kebele <- kebele %>%
  mutate(
    forest_share = forest / area_total,    #share of kebele area under forest
    pop_dens     = total_pop / area_total, #population density
    female_pop   = female / total_pop      #female share of the population
  )


#Landscape diversity
#based on area (in ha) of 12 LULC types

#Landscape diversity per kebele: Gini-Simpson index over the areas of the
#LULC types (arable:town), after shifting areas to be >= 1.
kebele2 <- kebele %>% 
  filter(!kebele %in% excluded_kebeles) %>% 
  select(kebele, arable:town) %>% 
  pivot_longer(cols = arable:town, names_to = "lulc", values_to = "area") %>% 
  mutate(area = area - min(area) + 1) %>% #shift so all areas >= 1 (positive input required)
  mutate(across(c(kebele, lulc), factor)) %>% 
  as.data.frame() %>% 
  diverse::diversity(type = c('gs')) %>% #CONSISTENCY FIX: namespace-qualified like the other diversity() calls, so another attached package cannot shadow it
  as_tibble(rownames = "kebele") 

# select the index here  
div_lulc <- kebele2 %>% 
  select(kebele, lulc_div = gini.simpson)


#Remoteness

#Robust scale distance from nearest town, and nearest road, adjust all values to be larger than zero, then sum.
#(robust scale to ensure equal weight for the two distance measures)
#values changed from original version, but still same order of kebeles
#(because towns excluded, and each of the two variables >0 separately)

#Remoteness: robust-scale the two distance measures (equal weighting),
#shift each to be >= 0, and sum them.
remoteness <- kebele %>%
  filter(!kebele %in% excluded_kebeles) %>%
  select(kebele, distance_town, distance_road) %>%
  mutate(across(c(distance_town, distance_road), robust_scalar)) %>%
  mutate(across(c(distance_town, distance_road), ~ .x - min(.x))) %>%
  mutate(remoteness = distance_town + distance_road) %>%
  select(kebele, remoteness)


#Combine and save df####
#Combine all kebele-level variables into the analysis data frame and save it.
#All joins are keyed explicitly on kebele (silences join messages, guards
#against accidental multi-column natural joins).
df <- tele %>% 
  left_join(specialisation, by = "kebele") %>%
  left_join(comp_adv, by = "kebele") %>%
  left_join(kebele %>% select(kebele, altitude_mean, female_pop, forest_share, pop_dens, wealth),
            by = "kebele") %>% 
  left_join(div_lulc, by = "kebele") %>% 
  left_join(remoteness, by = "kebele") %>% 
  ungroup()


save(df, file = "df_es_spec.RData")






#DATA ANALYSIS####

# Data analysis of telecoupling, ES specialisation, and comparative advantage.
# Data on ES production and flows from Gera, Gumay & Setema woreda, Jimma zone, Ethiopia.


#Aim of analysis####

# The goal of this paper is to analyze the relationship between telecoupling, specialization and comparative advantage
# in a smallholder agricultural landscape in Ethiopia.
# We try to understand these relationships across all kebeles (= smallest administrative units in Ethiopia)
# in the study area, but also by differentiating kebeles in the study area into groups
# based on their farming systems.
# To achieve this, we
# (1) define indices to measure telecoupling, specialization and comparative advantage in each kebele
# based on ecosystem service production and flow data, (done in dataframe generation)
# (2) analyze the relationship between telecoupling and ecosystem service specialization,
# and between specialization and comparative advantage across kebeles in the study area,
# (3) analyze how these relationships differ between kebele groups by farming type, and
# (4) explore the role of other potential (social and ecological) driving factors for specialization.

#Definitions:
#   
#Specialisation: degree of concentration of ES production in each kebele
#a kebele is more specialized if its ES production is more concentrated and less evenly distributed
#Comparative advantage: degree of concentration of ES productivities in each kebele
#a kebele has a higher value for comparative advantage if its ecosystem service productivities are more concentrated and less evenly distributed
#Telecoupling: degree of telecoupling in terms of ES production and flows
#

#Setup####


#libraries
library(bestNormalize) #suite of transformation-estimating functions that can be used to normalize data
library(ggfortify) # for plotting (autoplot) of lm objects
library(gvlma) #Global Validation of Linear Models Assumptions
library(MuMIn) #Tools for performing model selection and model averaging
library(tidyverse) # general programming
library(readxl) # import of excel workbook sheets
library(factoextra) # Extract and Visualize the Results of Multivariate Data Analyses
library(FactoMineR) # Multivariate Exploratory Data Analysis and Data Mining
library(rstatix) # Pipe-Friendly Framework for Basic Statistical Tests
library(ggpubr)    # visualization publication ready plots
library(sjPlot) #Collection of plotting and table output functions for data visualization
library(PerformanceAnalytics) #econometric functions for performance and risk analysis
library(GGally) #to build correlograms
library(cowplot)
library(ggsci)
library(ragg)
library(gridExtra)

#Data import
#load(file = paste0(wd, "df_es_spec.RData"))

#Re-derive prod2 for the analysis section.
#NOTE(review): unlike the dataframe-development version, excluded_kebeles are
#NOT filtered out here — presumably deliberate (two rows are dropped by
#position before clustering below); confirm intentional.
prod2 <- prod %>% 
  dplyr::select(woreda:area_prod) %>% 
  dplyr::filter(kebele!="NaN") %>% 
  mutate(woreda = factor(woreda),
         product = factor(product, labels = product_levels_prod)) %>% 
  filter(!product %in% excluded_products)


product_levels_bean <- c("Cattle\n(13.00)", "Beef\n(51.58)", "Coffee\n(30.33)", "Eucalyptus\n(12.67)",
                         "Firewood", "Honey\n(27.17)", "Khat", "Maize\n(14.33)", "Sorghum\n(13.25)", "Teff\n(15.75)")

bean2 <- bean %>% 
  dplyr::select(woreda:beans_global) %>% 
  dplyr::filter(kebele!="average household") %>% 
  pivot_longer(cols = beans_hh:beans_global, names_to = "scale", values_to = "count") %>% 
  mutate(woreda = factor(woreda),
         product = factor(product, labels = product_levels_bean),
         scale = factor(scale, levels = scale_levels, labels = scale_labels),
         count = as.integer(count)) %>% 
  filter(!product %in% excluded_products)



#ES flows####

#Mean bean count per scale and ES (averaged over the bean-exercise kebeles)
es_score <- bean2 %>%
  group_by(scale, product) %>%
  summarise(count = mean(count, na.rm = TRUE))


#Per-ES telecoupling score, built directly from es_score
#(same construction as in the dataframe-development part)
teleconnect <- es_score %>%
  mutate(score = recode(scale, !!!recode_scale_0.4),
         countscore = count * score) %>%
  group_by(product) %>%
  summarise(countscore = sum(countscore, na.rm = TRUE))


jco <- c("HH" = "#0073C2FF", "local" = "#7AA6DCFF", "district" = "#868686FF",
         "central" = "#EFC000FF", "global" = "#CD534CFF")

#png(file="Fig3.png", width = 886, height = 500)
ggplot(data = es_score, mapping = aes(x = product, y = count, fill = scale)) + 
  geom_bar(stat="identity", position = position_stack(reverse = TRUE)) +
  xlab("Ecosystem service") +
  ylab("Average number of beans")+
  scale_fill_manual(values = jco, name = "Scale") +
  scale_x_discrete(limits = c("Beef\n(51.58)", "Cattle\n(13.00)", "Coffee\n(30.33)", "Eucalyptus\n(12.67)",
                              "Honey\n(27.17)", "Maize\n(14.33)", "Sorghum\n(13.25)", "Teff\n(15.75)")) +
  theme_classic(base_size = 20) +
  theme(axis.text.x = element_text(size= 13))
#dev.off()



#Kebele clustering####

#We cluster kebeles based on their ES production (robust scaled).
#We will use them to see if the resulting groups can explain differences in the correlations
#between our three main variables.

#(robust scaled ES production data: to make ES comparable
#but not adjusted for kebele area: for clustering we are interested in what kebeles actually produce
#at an absolute level, independent of their area)

#Robust scaling (median / IQR); re-defined here so the analysis section can
#be run standalone. Same improvements as the dataframe-development version:
#x:     numeric vector to scale
#na.rm: drop NAs when computing median/quantiles (default FALSE keeps the
#       original behaviour)
robust_scalar <- function(x, na.rm = FALSE){
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  (x - median(x, na.rm = na.rm)) / (q[2] - q[1])
}

#Robust-scale raw (not area-adjusted) production within each ES — clustering
#should reflect what kebeles produce at an absolute level.
prod_rs <- prod2 %>%
  dplyr::select(woreda, kebele, product, production) %>%
  group_split(product) %>%
  purrr::map_dfr(function(d) dplyr::mutate(d, production_rs = robust_scalar(production))) %>%
  dplyr::select(-production)

#one row per kebele, one column per ES
prod_rs_wide <- prod_rs %>%
  pivot_wider(names_from = product, values_from = production_rs)

#Drop two kebeles before clustering.
#NOTE(review): rows are removed by position (7 and 9) — fragile if the sheet
#order ever changes; consider filtering by kebele name instead.
prod_rs_wide <- prod_rs_wide[-c(7,9),]

#hierarchical clustering (Ward) on euclidean distances between kebele ES profiles
prod_rs_wide <- as.data.frame(prod_rs_wide)
rownames(prod_rs_wide) <- prod_rs_wide$kebele
dist_mat <- dist(x = prod_rs_wide[,c(3:10)], method = "euclidean") #renamed from 'dist' to avoid masking stats::dist()
clust <- hclust(d = dist_mat, method = "ward.D2")

#look at the dendrogram and judge based on group interpretability 
plot(clust) #three groups

prod_rs_wide$clust <- cutree(clust, k = 3)

#BUG FIX: plot() and rect.hclust() are base-graphics calls; chaining them with
#ggplot-style '+' drew both but then errored ("non-numeric argument to binary
#operator"). They must be separate statements.
plot(clust, hang = -1, xlab = NULL, sub = NULL, main = NULL, cex.lab=1.5)
rect.hclust(clust, k=3, border= c("#999999", "#E69F00", "#56B4E9"))

prod_rs_wide$clust <- as.factor(prod_rs_wide$clust)
levels(prod_rs_wide$clust) <- c("Beef", "Coffee/Honey", "Mixed") #cluster names from dominant ES
summary(prod_rs_wide$clust)

#Attach the cluster assignment to df, selecting columns by name (the original
#positional indices c(2,11) break if the column layout changes) and joining
#explicitly on kebele.
df <- df %>% 
  left_join(prod_rs_wide[, c("kebele", "clust")], by = "kebele")


#apply PCA to visualize results

#rename ES columns to display names for the biplot
prod_rs_wide <- prod_rs_wide %>%
  rename(Cattle = cattle, Beef = beef, Coffee = coffee, Eucalyptus = euca, Honey = honey, Maize = maize, Sorghum = sorghum, Teff = teff)

#PCA on the robust-scaled ES profiles; scale.unit = FALSE keeps the robust scaling
res.pca <- PCA(prod_rs_wide[,c(3:10)], scale.unit = FALSE, graph = FALSE)

#png(file="Fig4.png", width = 886, height = 500, bg = "transparent", type = "cairo")
fviz_pca_biplot(res.pca,
                col.ind = prod_rs_wide$clust, palette = "jco",
                addEllipses = FALSE, mean.point = FALSE, label = "var", #concentration ellipse
                col.var = "black", repel = TRUE,
                legend.title = " Cluster", title = NULL,
                pointsize = 4, labelsize = 6) +
  theme_classic(base_size = 20) +
  labs(x= "PCA1 (33.4%)", y = "PCA2 (23.8%)")
#NOTE(review): axis percentages are hard-coded — re-check against res.pca$eig if the data change
#dev.off()




#Correlation analyses####

#are variables normally distributed?
#Shapiro-Wilk tests for the three main variables
shapiro_test(df, tele_score, spec_gini_simpson, comp_adv_gini_simpson)
#not normally distributed --> Pearson correlation not appropriate
#"In the normal case, Kendall correlation is more robust and efficient than Spearman correlation.
#It means that Kendall correlation is preferred when there are small samples or some outliers."
#(https://datascience.stackexchange.com/questions/64260/pearson-vs-spearman-vs-kendall)

#display name for plotting
df <- df %>% rename(Cluster = clust)


#Spec-tele
p <- ggscatter(df, x = "tele_score" , y = "spec_gini_simpson", conf.int = FALSE, 
               cor.coef = FALSE, cor.method = "kendall", shape="Cluster", color= "Cluster", size = 4, palette = "jco") +
  xlab("Telecoupling") +
  ylab("Specialization") +
  theme_classic(base_size = 20) +
  stat_cor(method = "kendall", cor.coef.name = c("tau"), size = 6, label.y = 0.13, label.x = 400, p.accuracy = 0.001) +
  stat_cor(aes(color = df$Cluster), method = "kendall", cor.coef.name = c("tau"), size = 6, label.y = c(0.235,0.245,0.255))

xdens <- axis_canvas(p, axis = "x")+
  geom_density(data = df, aes(x = tele_score, fill = Cluster),
               alpha = 0.7, size = 0.2)+
  ggpubr::fill_palette("jco")

ydens <- axis_canvas(p, axis = "y", coord_flip = TRUE)+
  geom_density(data = df, aes(x = spec_gini_simpson, fill = Cluster),
               alpha = 0.7, size = 0.2)+
  coord_flip()+
  ggpubr::fill_palette("jco")

p1 <- insert_xaxis_grob(p, xdens, grid::unit(.2, "null"), position = "top")
p2<- insert_yaxis_grob(p1, ydens, grid::unit(.2, "null"), position = "right")

#png(file="Fig5.png", width = 886, height = 500, bg = "transparent", type = "cairo")
ggdraw(p2)
#dev.off()


#Spec-comp_adv
p <- ggscatter(df, x = "comp_adv_gini_simpson" , y = "spec_gini_simpson", conf.int = FALSE,
               cor.coef = FALSE, cor.method = "kendall", shape="Cluster", color= "Cluster", size = 4, palette = "jco") +
  xlab("Comparative advantage") +
  ylab("Specialization") +
  theme_classic(base_size = 20) +
  stat_cor(method = "kendall", cor.coef.name = c("tau"), size = 6, label.y = 0.13, label.x = 0.18, p.accuracy = 0.001) +
  stat_cor(aes(color = df$Cluster), method = "kendall", cor.coef.name = c("tau"), size = 6, label.y = c(0.235,0.245,0.255))

xdens <- axis_canvas(p, axis = "x")+
  geom_density(data = df, aes(x = comp_adv_gini_simpson, fill = Cluster),
               alpha = 0.7, size = 0.2)+
  ggpubr::fill_palette("jco")


ydens <- axis_canvas(p, axis = "y", coord_flip = TRUE)+
  geom_density(data = df, aes(x = spec_gini_simpson, fill = Cluster),
               alpha = 0.7, size = 0.2)+
  coord_flip()+
  ggpubr::fill_palette("jco")

p1 <- insert_xaxis_grob(p, xdens, grid::unit(.2, "null"), position = "top")
p2<- insert_yaxis_grob(p1, ydens, grid::unit(.2, "null"), position = "right")

#png(file="SM_Corr_Spec_CompAdv.png", width = 886, height = 500, bg = "transparent", type = "cairo")
ggdraw(p2)
#dev.off()





#Linear model####

#we want to confirm if comparative advantage and/or telecoupling are correlated with/cause specialisation
#we want to see what else might explain specialisation

#dependent variable: spec
#focus predictors: comp_adv, tele_score
#independent variables: altitude_mean, female_pop, forest_share, lulc_div, pop_dens, remoteness, wealth


#Data exploration####

#Zuur, A.F., Ieno, E.N., Elphick, C.S., 2010. A protocol for data exploration to avoid common statistical problems. Methods Ecol Evol 1 (1), 3-14. 10.1111/j.2041-210X.2009.00001.x.

#overview of the analysis data frame
summary(df)
str(df)


#Outliers y and x: boxplots of every candidate variable (outliers in red)
df_long <- df %>% 
  pivot_longer(tele_score:remoteness)

ggplot(data = df_long, 
       aes(x = name, y = value)) + 
  geom_boxplot(outlier.color = "red") +
  facet_wrap(~name, scales = "free")

#altitude mean: no outliers
#comp_adv: three outliers --> keep, productivity data have been cross-checked
#comp_adv_gini: one outlier --> keep, productivity data have been cross-checked
#comp_adv_gini_simpson: four outlier --> keep, productivity data have been cross-checked
#female pop: one outlier --> keep, is official data
#forest_share: no outliers
#lulc_div: some outliers with low values --> keep, all have high forest_share, which is based on satellite images 
#pop_dens: one outlier --> keep, is official data
#remoteness: four outliers with high values --> keep, based on satellite images
#spec: three outliers --> will later be resolved through transformation
#spec_gini: no outliers
#spec_gini_simpson: no outliers
#tele_score: three outliers --> might later be resolved through transformation 
#wealth: no outliers


# Homogeneity of variance
#(will be checked in model validation)



#Normality y

#Here we test for normality in spec, and use the bestNormalize package to determine
#what might be the best transformation. We apply this, and then test again.

#density, QQ plot, and Shapiro-Wilk test for the untransformed outcome
ggpubr::ggdensity(df$spec_gini_simpson)
ggpubr::ggqqplot(df$spec_gini_simpson)
rstatix::shapiro_test(df$spec_gini_simpson)

#print candidate transformations ranked by bestNormalize (inspected, not stored)
df$spec_gini_simpson %>% bestNormalize::bestNormalize(allow_orderNorm = FALSE, out_of_sample = FALSE)

#apply the Box-Cox transformation and re-check normality
xnew <- df$spec_gini_simpson %>% bestNormalize::boxcox() %>% .$x.t
ggpubr::ggdensity(xnew)
ggpubr::ggqqplot(xnew)
rstatix::shapiro_test(xnew)

#better, but not perfect


#Zero trouble y: count zeros in the outcome
sum(df$spec_gini_simpson == 0) 
#No zeros in the outcome, so no trouble


#Collinearity x, relationships y and x

#Kendall correlation matrix of the outcome and all candidate predictors
df %>% 
  select(spec_gini_simpson, tele_score, comp_adv_gini_simpson, altitude_mean:remoteness) %>% 
  PerformanceAnalytics::chart.Correlation(., method = "kendall")

#okay, no corr > .5
#can also see some relationships between y and x


#Independence y
#(will be checked in model validation)


#Interactions

#we only include main potential interactions, to not overburden our model
#from theoretical considerations, we expect interactions between:

#comp_adv, altitude and landscape diversity
#(altitude as a proxy for forest share, which will most likely drop out due to collinearity issues)

#Altitude was classified into five ranges (<1300m, 1300-1500m, 1500-2100m, 2100-2300m, >2300m),
#mainly based on the altitudes where coffee growing is viable,
#both for currently suitable ranges and a projected future altitudinal shift until 2040.
#In our sample: no altitude under 1500, none over 2300 

#Dichotomise altitude at 2100 m: coffee growing viable below, not above
#(sample contains no kebeles under 1500 m or over 2300 m)
df <- df %>%
  mutate(altitude_cat = cut(altitude_mean,
                            breaks = c(0, 2100, Inf),
                            labels = c("coffee", "no coffee")))

ggscatter(df, x = "comp_adv_gini_simpson", y = "spec_gini_simpson", add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "kendall", facet.by = "altitude_cat") +
  xlab("Comparative advantage (comp_adv_gini_simpson)") +
  ylab("Specialization (spec_gini_simpson)") +
  theme(text = element_text(size = 20))
#stronger relationship between comp_adv and spec for higher altitude kebeles
#possible explanation: in higher altitudes, we have less forest, so spec is driven more by comp_adv,
#because people actually have a choice where to specialize (with high forest share, you specialize in forest ES)


ggscatter(df, x = "lulc_div", y = "spec_gini_simpson", add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "kendall", facet.by = "altitude_cat") +
  xlab("Landscape diversity (lulc_div)") +
  ylab("Specialization (spec_gini_simpson)") +
  theme(text = element_text(size = 20))
#relationship more pronounced in high altitudes


ggscatter(df, x = "lulc_div", y = "spec_gini_simpson", add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "kendall") +
  xlab("Landscape diversity (lulc_div)") +
  ylab("Specialization (spec_gini_simpson)") +
  theme(text = element_text(size = 20)) +
  facet_wrap(~ cut_number(df$comp_adv_gini_simpson, 2))
#for kebeles with low comp_adv values, the negative relationship between lulc_div and spec is more pronounced
#possible explanation: without comparative advantages, the incentive to specialize is based on land cover 


#Collinearity check: altitude vs forest share
ggscatter(df, x = "altitude_mean", y = "forest_share", add = "reg.line", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "kendall") +
  xlab("Mean altitude (altitude_mean)") +
  ylab("Forest Share (forest_share)") +
  theme(text = element_text(size = 20))
#significant negative relationship between altitude and forest share
#one will most likely drop out of the model due to collinearity


#Collinearity check: forest share vs landscape diversity (loess fit)
ggscatter(df, x = "forest_share", y = "lulc_div", add = "loess", conf.int = TRUE,
          cor.coef = TRUE, cor.method = "kendall") +
  xlab("Forest Share (forest_share)") +
  ylab("Landscape diversity (lulc_div)") +
  theme(text = element_text(size = 20))
#significant negative relationship between forest share and landscape diversity
#one will most likely drop out of the model due to collinearity




#Variable transformations####
#Models attempting to find explanations often benefit from centred and scaled predictors.
#We also saw above the likely benefit of transforming spec (using boxcox).


#Fit standardised normalising transformations; the *Mod objects are kept so
#predictions can later be back-transformed with predict(..., inverse = TRUE)
spec_gini_simpson_bcMod <- df$spec_gini_simpson %>% bestNormalize::boxcox(standardize = TRUE) 
comp_adv_gini_simpson_yjMod <- df$comp_adv_gini_simpson %>% bestNormalize::yeojohnson(standardize = TRUE)
lulc_div_yjMod <- df$lulc_div %>% bestNormalize::yeojohnson(standardize = TRUE)
female_pop_yjMod<- df$female_pop %>% bestNormalize::yeojohnson(standardize = TRUE) 
remoteness_yjMod<- df$remoteness %>% bestNormalize::yeojohnson(standardize = TRUE)
tele_score_yjMod<- df$tele_score %>% bestNormalize::yeojohnson(standardize = TRUE) 
altitude_meanMod <- df$altitude_mean %>% bestNormalize::center_scale()

#Centre and scale the remaining (untransformed) predictors.
#scale() returns a 1-column matrix, so wrap it in as.numeric() to keep the
#columns plain numeric vectors (replaces the three explicit as.numeric() calls
#the original needed after the fact).
df_ts <- df %>% 
  dplyr::select(kebele, altitude_mean, pop_dens, wealth) %>%
  mutate(across(altitude_mean:wealth, ~ as.numeric(scale(.x))))

#Collect the transformed variables and attach them to the scaled predictors
df_ts2 <- tibble(
  kebele = df$kebele,
  spec_gini_simpson_bc     = spec_gini_simpson_bcMod$x.t,
  comp_adv_gini_simpson_yj = comp_adv_gini_simpson_yjMod$x.t,
  lulc_div_yj              = lulc_div_yjMod$x.t,
  female_pop_yj            = female_pop_yjMod$x.t,
  remoteness_yj            = remoteness_yjMod$x.t,
  tele_score_yj            = tele_score_yjMod$x.t
)

df_ts <- left_join(df_ts, df_ts2, by = "kebele")




#Model fitting####
#with interactions for comp_adv and altitude and landscape diversity
#excluding forest share beforehand due to collinearity 


#Full model: all main effects plus the three theoretically motivated interactions.
#(note: 'a*b' in an R formula expands to a + b + a:b, so listing the main
#effects separately as well is redundant but harmless)
lm1 <- lm(spec_gini_simpson_bc ~ comp_adv_gini_simpson_yj + tele_score_yj + altitude_mean + lulc_div_yj +
            remoteness_yj + female_pop_yj + pop_dens + wealth + 
            altitude_mean*comp_adv_gini_simpson_yj + altitude_mean*lulc_div_yj +
            comp_adv_gini_simpson_yj*lulc_div_yj, df_ts)
summary(lm1)
autoplot(lm1) #residual diagnostics
car::vif(lm1) #tele_score is at VIF 2.8, so > 2.5, but also way below 4, keep in

#stepwise selection (both directions) by AIC
lm1both <- MASS::stepAIC(lm1, direction = "both", trace = TRUE) 
summary(lm1both)
autoplot(lm1both) 
car::vif(lm1both)



#Model validation####

#overview: diagnostic plots and global validation of model assumptions
autoplot(lm1both)
summary(gvlma(lm1both))
par(mar = rep(2, 4))
plot(gvlma(lm1both))

#linearity
plot(lm1both, 1) #the red line should be approximately horizontal at zero
#good

#normality of residuals
plot(lm1both, 2) #normal probability plot of residuals should approximately follow a straight line
sresid <- MASS::studres(lm1both) 
shapiro.test(sresid) #Shapiro-Wilk Normality Test
#good

#homogeneity of variance (homoscedasticity)
plot(lm1both, 3) #the red line should be horizontal with equally spread points
car::ncvTest(lm1both) #non-constant error variance test
#okay


#outliers and high leverage points
plot(lm1both, 5)

#OUTLIERS
# Outliers can be identified by examining the standardized residual (or studentized residual),
# which is the residual divided by its estimated standard error.
# Standardized residuals can be interpreted as the number of standard errors away from the regression line.
# Observations whose standardized residuals are greater than 3 in absolute value are possible outliers
#-->all lower than 3

#HIGH LEVERAGE
# A data point has high leverage, if it has extreme predictor x values.
# This can be detected by examining the leverage statistic or the hat-value.
# A value of this statistic above 2(p + 1)/n indicates an observation with high leverage (P. Bruce and Bruce 2017);
# where, p is the number of predictors and n is the number of observations.
#leverage statistic above 2(p + 1)/n: high leverage
#p is the number of predictors and n is the number of observations, p=10, n=61
#2(p + 1)/n = 22/61 = 0.36 
which(hatvalues(lm1both)>0.36)
#-->one point has higher leverage statistic: 7

#INFLUENTIAL VALUES
plot(lm1both, 4) #high influence if Cook's distance exceeds 4/(n - p - 1) = 4/(61-10-1) = 0.08
#NOTE(review): augment() is a broom generic; broom is not in the library() list
#above — presumably attached via a dependency. Confirm it is available.
model.diag.metrics <- augment(lm1both)
model.diag.metrics %>%
  top_n(5, wt = .cooksd)
#-->observations 1, 7 and 20 exceed
#When data points have high Cook's distance scores
#and are to the upper or lower right of the leverage plot,
#they have leverage meaning they are influential to the regression results.
plot(lm1both, 5)
#1 and 20 have high Cook's distance scores, but no high leverage
#7 has high leverage statistic, and relatively high Cook's, but at 0.5, so still okay
#we know that the data point has exceptionally high values for comp adv
#https://www.statisticshowto.com/cooks-distance/
#okay

#independence/autocorrelation
car::durbinWatsonTest(lm1both)
acf(lm1both$residuals)
#okay



#Effect plot####

#Build a prediction grid: sweep (scaled) altitude over its observed range at
#the 25/50/75% quantiles of transformed comparative advantage, holding all
#other (centred) predictors at 0.
nOut <- 300 #points along the altitude axis
qnX2 <- c(0.25, 0.5, 0.75)   #moderator quantiles
X1 <- "altitude_mean"
X2 <- "comp_adv_gini_simpson_yj"
minX1 <- min(df_ts[X1])
maxX1 <- max(df_ts[X1])
mX2 <- df_ts[X2] %>% pull() %>% quantile(qnX2)

newData_yj <- tibble(
  altitude_mean = seq(from=minX1, to=maxX1, length.out=nOut) %>% rep(length(qnX2)),
  comp_adv_gini_simpson_yj = rep(mX2, each = nOut),
  lulc_div_yj = rep(0),
  female_pop_yj = rep(0),
  remoteness_yj = rep(0),
  tele_score_yj = rep(0),
  pop_dens = rep(0),
  wealth = rep(0)
)


#predict over this new data (still on the transformed scales)
newPreds_bc <- predict.lm(lm1both, newData_yj, interval='confidence') %>% as_tibble()

#Then we can un-transform all of these - and factor the factor
newOut <- tibble(
  spec_gini_simpson_fit = predict(spec_gini_simpson_bcMod, newdata=newPreds_bc$fit, inverse=TRUE),
  spec_gini_simpson_lwr = predict(spec_gini_simpson_bcMod, newdata=newPreds_bc$lwr, inverse=TRUE),
  spec_gini_simpson_upr = predict(spec_gini_simpson_bcMod, newdata=newPreds_bc$upr, inverse=TRUE),
  altitude_mean = predict(altitude_meanMod, newdata=newData_yj$altitude_mean, inverse=TRUE),
  comp_adv_gini_simpson_yjF = predict(comp_adv_gini_simpson_yjMod,
                                      newdata=newData_yj$comp_adv_gini_simpson_yj, inverse=TRUE) %>% as.factor()
)

#NOTE(review): the quantile values in the labels are hard-coded — re-derive
#from the back-transformed mX2 if the data change
levels(newOut$comp_adv_gini_simpson_yjF) <- c("q(25%)=0.1272", "q(50%)=0.1292", "q(75%)=0.1339")

#png(file="Fig6.png", width = 886, height = 500, bg = "transparent", type = "cairo")
ggplot(newOut) +
  geom_ribbon(aes(x=altitude_mean, 
                  ymin=spec_gini_simpson_lwr, 
                  ymax=spec_gini_simpson_upr,
                  fill= comp_adv_gini_simpson_yjF), alpha = 0.3) +
  scale_fill_viridis_d(name = "Comparative advantage") +
  geom_line(aes(x=altitude_mean, 
                y=spec_gini_simpson_fit,
                color= comp_adv_gini_simpson_yjF)) +
  scale_color_viridis_d(guide ="none") +
  xlab("Mean altitude") +
  ylab("Specialization") +
  scale_x_continuous(expand = c(0,0)) +
  scale_y_continuous(expand = c(0,0)) +
  theme_classic(base_size = 20)
#dev.off()





















# #effect plots
# p <- ggpredict(lm1both, terms = "tele_score")
# ggplot(p, aes(x, predicted)) + geom_line() +
#   xlab("Telecoupling") +
#   ylab("Predicted values for specialization")
# 
# p <- ggpredict(lm1both, terms = "altitude_mean")
# ggplot(p, aes(x, predicted)) + geom_line() +
#   xlab("Mean altitude") +
#   ylab("Predicted values for specialization")
# 
# p <- ggpredict(lm1both, terms = c("comp_adv_gini_simpson","altitude_mean[-1,0,1]"))
# ggplot(p, aes(x, predicted, color = group)) + geom_line() +
#   xlab("Comparative advantage") +
#   ylab("Predicted values for specialization")
# 
# ggeffect(lm1both, terms = "tele_score") %>% 
#   plot()
#  
# 
# ggeffect(lm1both, terms = c("comp_adv_gini_simpson","altitude_mean")) %>% 
#   plot()
# 
# 
# interplot(m = lm1both, var1 = "altitude_mean", var2 = "comp_adv_gini_simpson") +
#   xlab("Mean altitude") +
#   ylab("Estimated coefficient for comparative advantage") +
#   geom_hline(yintercept = 0, linetype = "dashed") +
#   theme(text = element_text(size = 20))















