ERF_Paper_Code_Output.Rmd

---
title: "ERF Paper"
author: "Tegveer Ghura"
date: "2023-02-10"
output:
  html_document:
    number_sections: yes
---

```{r setup, include=FALSE}
# import packages
library(naivebayes)
library(klaR)
library(e1071)
library(caTools)
library(caret)
library(tidyverse)
library(kableExtra)
library(stargazer)
library(flextable)
library(dplyr)
library(leaps)
library(caret)
library(ggplot2)
library(ggExtra)
library(tidyr)
library(ISLR2)
library(ISLR)
library(readr)
library(reticulate)
library(gtsummary)
library(stargazer)
library(randomForest)
library(ROCR)
library(pROC)
```

```{r df, warning=FALSE, message=FALSE}
# Read the cleaned REKT database and drop the columns listed below
# (`...1` is the name readr assigns to an unnamed first column).
df <- read_csv("Data/REKT_Database_Clean_Python.csv") %>%
  dplyr::select(
    -all_of(c("...1", "token_name", "description", "name_categories"))
  )
#df <- df %>% filter(funds_lost!=0)
```

```{r cleantext}
# scam_type: strip the dictionary syntax. Each pattern is deleted in turn and
# the order matters, because later patterns run on the output of earlier ones.
scam_type_noise <- c("[^:]*,[^:]*", "'id'::", "\\{|\\}", "'", "type: ", " ")
for (noise in scam_type_noise) {
  df$scam_type <- gsub(noise, "", df$scam_type)
}

# scamNetworks: drop the list brackets and quotes, then collapse ", " to ","
# so the values can be split on a bare comma when the networks are grouped.
networks <- gsub("\\[|\\]", "", df$scamNetworks)
networks <- gsub("'", "", networks)
df$scamNetworks <- gsub(", +", ",", networks)
```

```{r scamtype}
# Label every row as "Exit Scam" or "Exploit". A row is an exit scam when its
# scam type is one of the types below or its project is one of the listed
# projects; every other row is an exploit.
exit_scam_types <- c("Honeypot", "Rugpull", "Abandoned")
exit_scam_projects <- c(
  "Kronos Dao", "Genesis", "Celsius Network", "Voyager", "BlockFi",
  "FTX Group", "Bitcoin Sheikh", "Trade Coin Club", "EmpiresX", "Vauld"
)

# Element-wise "equals any of `candidates`". Built from `==` and `|` rather
# than `%in%` so missing values propagate exactly as in a chain of comparisons.
equals_any <- function(values, candidates) {
  Reduce(`|`, lapply(candidates, function(candidate) values == candidate))
}

df <- df %>%
  mutate(
    scam_type_grouped = if_else(
      equals_any(scam_type, exit_scam_types) |
        equals_any(project_name, exit_scam_projects),
      "Exit Scam",
      "Exploit"
    )
  ) %>%
  dplyr::select(
    -c(scam_type, day_of_week_of_attack, day_of_year_of_attack, date,
       project_name)
  )

# Class balance of the grouped target
table(df$scam_type_grouped)
```

```{r monthname}
# Per the original analysis notes, month_of_attack is the only column with NAs
# (1873 of them). Rather than dropping the rows or the column, translate the
# month number to its name and label the missing months "Unknown"; model
# performance with this imputation is checked first.
month_labels <- month.name[df$month_of_attack]
df$month_of_attack <- ifelse(is.na(month_labels), "Unknown", month_labels)
#df <- na.omit(df)
```

```{r scamnetworks}
# Give each network of a multi-network row its own row, then pool the networks
# listed below into a single "Other Decentralized" level. Every other network
# keeps its own name.
other_decentralized <- c(
  "Avax", "Algorand", "Arbitrum", "Cosmos", "Cronos", "Elastos", "Elrond",
  "EOS", "Fantom", "Fuse", "Gnosis", "Harmony", "Heco", "Klaytn", "KuCoin",
  "Moonriver", "Near", "OKExChain", "Optimism", "Ronin", "RSK", "Solana",
  "TRON", "Polkadot", "Other", "Terra Classic"
)

df <- df %>%
  separate_rows(scamNetworks, sep = ",") %>%
  mutate(
    scam_networks_grouped = if_else(
      scamNetworks %in% other_decentralized,
      "Other Decentralized",
      scamNetworks
    )
  ) %>%
  filter(scam_networks_grouped != "") %>% # remove empty string level
  dplyr::select(-scamNetworks)
```

```{r dtype}
# Fix the column types before the train/test split and put the monetary
# columns on a log scale. 1 is added before taking the log because
# funds_returned contains zeros, which would otherwise become -Inf.
df <- df %>%
  mutate(
    across(
      c(scam_networks_grouped, scam_type_grouped, month_of_attack),
      as.factor
    ),
    log_funds_lost = log(funds_lost + 1),
    log_funds_returned = log(funds_returned + 1)
  ) %>%
  dplyr::select(-c(funds_lost, funds_returned))
```


```{r rfmodel}
library(caret)

set.seed(3738)

# Shuffle the rows. seq_len() keeps this well-defined for an empty data frame
# and draws the same permutation as 1:nrow(df) otherwise.
df <- df[sample(seq_len(nrow(df))), ]

# 80/20 split, stratified on the pooled network factor.
train.index <- createDataPartition(df$scam_networks_grouped,
                                   p = .8, list = FALSE)
train <- df[train.index, ]
test <- df[-train.index, ]

# The forest only sees the two log-scaled amounts and the pooled network.
x_train <- train %>% select(log_funds_lost, log_funds_returned,
                            scam_networks_grouped)
y_train <- train$scam_type_grouped

x_test <- test %>% select(log_funds_lost, log_funds_returned,
                          scam_networks_grouped)
y_test <- test$scam_type_grouped

classifier_RF <- randomForest(x = x_train,
                              y = y_train,
                              ntree = 500)

classifier_RF

# Predicting the Test set results
y_pred <- predict(classifier_RF, newdata = x_test)

# Test confusion matrix: rows are the actual classes, columns the predictions
cm.rf <- as.matrix(table(Actual = y_test, Predicted = y_pred))
cm.rf

n <- sum(cm.rf) # number of instances
nc <- nrow(cm.rf) # number of classes
diag <- diag(cm.rf) # number of correctly classified instances per class
rowsums <- rowSums(cm.rf) # number of test instances per actual class
colsums <- colSums(cm.rf) # number of predictions per class
p <- rowsums / n # distribution of instances over the actual classes
q <- colsums / n # distribution of instances over the predicted classes

accuracy <- sum(diag) / n
cat("Accuracy:", accuracy)

precision <- diag / colsums
recall <- diag / rowsums
f1 <- 2 * precision * recall / (precision + recall)

data.frame(precision, recall, f1)

macroPrecision <- mean(precision)
macroRecall <- mean(recall)
macroF1 <- mean(f1)
# Weight each class's F1 by its support in the evaluated (test) set. Using
# the class counts of the full data set would mix train rows into a test
# metric.
weightedF1 <- sum(rowsums * f1) / sum(rowsums)

data.frame(macroPrecision, macroRecall, macroF1, weightedF1)
```

```{r rfmodel2}
# Variable importance of the fitted forest: the numeric table first, then the
# corresponding plot.

#jpeg("VarImpRF.jpg", width = 700, height = 350)
randomForest::importance(classifier_RF)
randomForest::varImpPlot(
  classifier_RF,
  main = "Variable Importance: Random Forest"
)
# Close the jpeg file
#dev.off() 
```

```{r rftreeplot}
#jpeg("tree_num_41.jpg", width = 700, height = 350)
library(dplyr)
library(ggraph)
library(igraph)

# Draw one tree of a random forest as a dendrogram.
#
# final_model: fitted model passed to randomForest::getTree().
# tree_num:    index of the tree to extract (getTree's `k`).
#
# The plot is printed as a side effect.
tree_func <- function(final_model, tree_num) {
  # Pull the tree out with labelled split variables / predictions and keep
  # the node id (row name) as a regular column.
  nodes <- randomForest::getTree(final_model, k = tree_num, labelVar = TRUE)
  nodes <- tibble::rownames_to_column(nodes)
  # Rows with a prediction get an NA split point so their 0s are not plotted.
  nodes <- mutate(
    nodes,
    `split point` = ifelse(is.na(prediction), `split point`, NA)
  )

  # Edge list: every node points to its left and to its right daughter.
  edges <- data.frame(
    from = rep(nodes$rowname, 2),
    to = c(nodes$`left daughter`, nodes$`right daughter`)
  )

  # Build the graph and drop the vertex named "0", which should not be drawn.
  tree_graph <- delete_vertices(graph_from_data_frame(edges), "0")

  # Attach the labels used by the plot layers.
  V(tree_graph)$node_label <- gsub("_", " ", as.character(nodes$`split var`))
  V(tree_graph)$leaf_label <- as.character(nodes$prediction)
  V(tree_graph)$split <- as.character(round(nodes$`split point`, digits = 2))

  # Strip grids, borders and axes so only the tree itself remains.
  bare_theme <- theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    plot.background = element_rect(fill = "white"),
    panel.border = element_blank(),
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    plot.title = element_text(size = 18)
  )

  tree_plot <- ggraph(tree_graph, "dendrogram") +
    theme_bw() +
    geom_edge_link() +
    geom_node_point() +
    geom_node_text(aes(label = node_label), na.rm = TRUE, repel = TRUE) +
    geom_node_label(aes(label = split), vjust = 2.5, na.rm = TRUE,
                    fill = "white") +
    geom_node_label(aes(label = leaf_label, fill = leaf_label), na.rm = TRUE,
                    repel = TRUE, colour = "white", fontface = "bold",
                    show.legend = FALSE) +
    bare_theme

  print(tree_plot)
}

#get tree with biggest number of nodes
#tree_num <- which(classifier_RF$forest$ndbigtree == max(classifier_RF$forest$ndbigtree))

# Plot tree number 41 of the fitted forest
tree_func(final_model = classifier_RF, tree_num = 41)

# Close the jpeg file
#dev.off() 
```


```{r rfmodelcurves}
# Class probabilities from the random forest for both partitions
rf_train <- predict(classifier_RF, x_train, type = "prob")
rf_test <- predict(classifier_RF, x_test, type = "prob")

# ROCR prediction objects, scored with the second probability column
train.rf <- ROCR::prediction(rf_train[, 2], y_train)
test.rf <- ROCR::prediction(rf_test[, 2], y_test)

# True-positive rate against false-positive rate, i.e. the ROC curves
rf_perf_train <- ROCR::performance(train.rf, measure = "tpr",
                                   x.measure = "fpr")
rf_perf_test <- ROCR::performance(test.rf, measure = "tpr",
                                  x.measure = "fpr")
```

```{r logmodel}
library(caret)
set.seed(2377)
# The partition created for the random forest is reused here, so both models
# are fitted and evaluated on the same rows.
#train.index <- createDataPartition(df$scam_networks_grouped, 
                                   #p = .8, list = FALSE)
train <- df[train.index, ]
test <- df[-train.index, ]

# Encode the target as 1 = "Exploit", 0 = anything else for the binomial fit.
encode_exploit <- function(labels) as.numeric(labels == "Exploit")
train$scam_type_grouped <- encode_exploit(train$scam_type_grouped)
test$scam_type_grouped <- encode_exploit(test$scam_type_grouped)

# Logistic regression of the target on every remaining column
logistic_model <- glm(
  scam_type_grouped ~ .,
  data = train,
  family = binomial(link = "logit")
)
```

```{r logmodelsummary, results='asis'}
# logistic_model_summary <- summary(logistic_model)
# logistic_model_summary

# The chunk uses results='asis', so the output is passed through to the HTML
# document unescaped. Emit an HTML table: a plain-text table would be
# re-parsed as markdown and come out garbled.
stargazer::stargazer(logistic_model, type = "html", report = "vc*stp",
                     ci = TRUE)
```

```{r logmodel2}
# Both class labels in a fixed order. Every factor below uses these levels so
# the confusion matrices stay 2 x 2 even when the model predicts a single
# class.
class_levels <- c("Exit Scam", "Exploit")

# Turn the 0/1 target used for fitting back into a labelled factor. Input that
# is already labelled is only re-levelled, so re-running this chunk does not
# recode every row to "Exit Scam".
label_classes <- function(x) {
  if (is.numeric(x)) {
    x <- ifelse(x == 1, "Exploit", "Exit Scam")
  }
  factor(x, levels = class_levels)
}

# Predicted probability of "Exploit" for both partitions
train_prob_pred <- predict(logistic_model, type = "response", newdata = train)
test_prob_pred <- predict(logistic_model, type = "response", newdata = test)
#y_pred = ifelse(prob_pred > 0.5, "Exploit", "Exit Scam")

# Train Confusion Matrix (0.5 probability cut-off)
y_train_pred <- factor(
  ifelse(train_prob_pred > 0.5, "Exploit", "Exit Scam"),
  levels = class_levels
)
train$scam_type_grouped <- label_classes(train$scam_type_grouped)
(cm <- table(train$scam_type_grouped, y_train_pred))

# Test Confusion Matrix (0.5 probability cut-off)
y_test_pred <- factor(
  ifelse(test_prob_pred > 0.5, "Exploit", "Exit Scam"),
  levels = class_levels
)
test$scam_type_grouped <- label_classes(test$scam_type_grouped)
(cm <- table(test$scam_type_grouped, y_test_pred)) # NAs ignored
#y_pred <- as.factor(unname(y_pred)) # for cfm plot

# Test confusion matrix metrics: rows are actual classes, columns predictions
cm.lr <- as.matrix(table(Actual = test$scam_type_grouped,
                         Predicted = y_test_pred))
cm.lr

n <- sum(cm.lr) # number of instances
nc <- nrow(cm.lr) # number of classes
diag <- diag(cm.lr) # number of correctly classified instances per class
rowsums <- rowSums(cm.lr) # number of test instances per actual class
colsums <- colSums(cm.lr) # number of predictions per class
p <- rowsums / n # distribution of instances over the actual classes
q <- colsums / n # distribution of instances over the predicted classes

accuracy <- sum(diag) / n
cat("Accuracy:", accuracy)

precision <- diag / colsums
recall <- diag / rowsums
f1 <- 2 * precision * recall / (precision + recall)

data.frame(precision, recall, f1)

macroPrecision <- mean(precision)
macroRecall <- mean(recall)
macroF1 <- mean(f1)
# Weight each class's F1 by its support in the evaluated (test) set rather
# than by the class counts of the full data set.
weightedF1 <- sum(rowsums * f1) / sum(rowsums)

data.frame(macroPrecision, macroRecall, macroF1, weightedF1)
```

```{r cfmtrain}
# 1. Open jpeg file
#jpeg("Train_CFM.jpg", width = 350, height = 350)

library(scales)

# Heat-map of a caret confusion matrix with the accuracy in the title.
#
# m:       object returned by caret::confusionMatrix().
# dataset: label placed in front of "Accuracy" in the title.
ggplotConfusionMatrix <- function(m, dataset = "Train") {
  mytitle <- paste(dataset, "Accuracy",
                   percent_format()(m$overall[["Accuracy"]]))
  ggplot(data = as.data.frame(m$table),
         aes(x = Prediction, y = Reference)) +
    # shade by log count so large cells do not wash out the small ones
    geom_tile(aes(fill = log(Freq)),
              colour = "white", show.legend = FALSE) +
    scale_fill_gradient(low = "white", high = "#56B1F7") +
    geom_text(aes(x = Prediction, y = Reference,
                  label = Freq)) +
    ggtitle(mytitle) +
    scale_x_discrete(limits = rev) +
    theme_minimal() +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank())
}

# caret expects the predictions as `data` and the ground truth as `reference`.
# Passing them the other way round swaps the "Prediction" and "Reference" axes
# of the plot, and fails when the predictions contain fewer classes than the
# truth.
cfm_train <- confusionMatrix(data = y_train_pred,
                             reference = train$scam_type_grouped)
ggplotConfusionMatrix(cfm_train)

# Close the jpeg file
#dev.off() 
```


```{r cfmtest}
# Open jpeg file
#jpeg("Test_CFM.jpg", width = 350, height = 350)

library(scales)

# Heat-map of a caret confusion matrix with the accuracy in the title.
#
# m:       object returned by caret::confusionMatrix().
# dataset: label placed in front of "Accuracy" in the title.
ggplotConfusionMatrix <- function(m, dataset = "Test") {
  mytitle <- paste(dataset, "Accuracy",
                   percent_format()(m$overall[["Accuracy"]]))
  ggplot(data = as.data.frame(m$table),
         aes(x = Prediction, y = Reference)) +
    # shade by log count so large cells do not wash out the small ones
    geom_tile(aes(fill = log(Freq)),
              colour = "white", show.legend = FALSE) +
    scale_fill_gradient(low = "white", high = "#56B1F7") +
    geom_text(aes(x = Prediction, y = Reference,
                  label = Freq)) +
    ggtitle(mytitle) +
    scale_x_discrete(limits = rev) +
    theme_minimal() +
    theme(panel.grid.major = element_blank(),
          panel.grid.minor = element_blank())
}

# caret expects the predictions as `data` and the ground truth as `reference`.
# Passing them the other way round swaps the "Prediction" and "Reference" axes
# of the plot, and fails when the predictions contain fewer classes than the
# truth.
cfm_test <- confusionMatrix(data = y_test_pred,
                            reference = test$scam_type_grouped)
ggplotConfusionMatrix(cfm_test)

# Close the jpeg file
#dev.off() 
```

```{r roctraintest, message=FALSE, warning=FALSE}
# ROCR prediction objects for the logistic regression: predicted probability
# against the class label, for the train and the test partition
train_pred <- ROCR::prediction(train_prob_pred, train$scam_type_grouped)
test_pred <- ROCR::prediction(test_prob_pred, test$scam_type_grouped)

# ROC curves: true-positive rate against false-positive rate
perf_train <- ROCR::performance(train_pred, measure = "tpr",
                                x.measure = "fpr")
perf_test <- ROCR::performance(test_pred, measure = "tpr",
                               x.measure = "fpr")
```

# Plot ROC Curves and save

```{r roccurve-pdf, message=FALSE, warning=FALSE, include=FALSE, echo=FALSE}
# Open a pdf file
#pdf("ROC.pdf", width = 9.0, height = 6.00) 

# One colour and one legend entry per curve, in drawing order
roc_colours <- c("#009ECE", "#FF9E00", "darkgreen", "pink")
roc_labels <- c("Logistic Regression Train Data",
                "Logistic Regression Test Data",
                "Random Forest Train Data",
                "Random Forest Test Data")

# Switch to a serif font; the previous settings are restored at the end
old_par <- par(family = "serif")

# The logistic-regression train curve sets up the axes and the title
plot(perf_train,
     main = "Logistic Regression vs Random Forest AUC-ROC Curve",
     col = roc_colours[1],
     xlab = "False Positive Rate", ylab = "True Positive Rate",
     type = "l", lwd = 1.5, family = "serif",
     cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
     font.axis = 2, font.lab = 2)

# The remaining curves are drawn on top of it
extra_curves <- list(perf_test, rf_perf_train, rf_perf_test)
for (i in seq_along(extra_curves)) {
  plot(extra_curves[[i]], add = TRUE, col = roc_colours[i + 1], lwd = 1.5)
}

legend(0.75, 0.25, roc_labels,
       col = roc_colours,
       bty = "n", lwd = 1.2, cex = 0.75)
abline(0, 1, lty = 2, col = "gray") # Add y=x line
par(old_par)

# Close the pdf file
#dev.off()
```

```{r roccurve-jpg, message=FALSE, warning=FALSE, include=FALSE, echo=FALSE}
# Open a jpeg file
#jpeg("ROC_LR_RF.jpg", width=9, height=6.0, units="in", res = 300)

# One colour and one legend entry per curve, in drawing order
roc_colours <- c("#009ECE", "#FF9E00", "darkgreen", "pink")
roc_labels <- c("Logistic Regression Train Data",
                "Logistic Regression Test Data",
                "Random Forest Train Data",
                "Random Forest Test Data")

# Switch to a serif font; the previous settings are restored at the end
old_par <- par(family = "serif")

# The logistic-regression train curve sets up the axes and the title
plot(perf_train,
     main = "Logistic Regression vs Random Forest AUC-ROC Curve",
     col = roc_colours[1],
     xlab = "False Positive Rate", ylab = "True Positive Rate",
     type = "l", lwd = 1.5, family = "serif",
     cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
     font.axis = 2, font.lab = 2)

# The remaining curves are drawn on top of it
extra_curves <- list(perf_test, rf_perf_train, rf_perf_test)
for (i in seq_along(extra_curves)) {
  plot(extra_curves[[i]], add = TRUE, col = roc_colours[i + 1], lwd = 1.5)
}

legend(0.60, 0.25, roc_labels,
       col = roc_colours,
       bty = "n", lwd = 1.2, cex = 0.75)
abline(0, 1, lty = 2, col = "gray") # Add y=x line
par(old_par)

# Close the jpeg file
#dev.off()
```


```{r roccurve, message=FALSE, warning=FALSE}
# ROC curves of both models on both partitions, drawn in the document

# One colour and one legend entry per curve, in drawing order
roc_colours <- c("#009ECE", "#FF9E00", "darkgreen", "pink")
roc_labels <- c("Logistic Regression Train Data",
                "Logistic Regression Test Data",
                "Random Forest Train Data",
                "Random Forest Test Data")

# Switch to a serif font; the previous settings are restored at the end
old_par <- par(family = "serif")

# The logistic-regression train curve sets up the axes and the title
plot(perf_train,
     main = "Logistic Regression vs Random Forest AUC-ROC Curve",
     col = roc_colours[1],
     xlab = "False Positive Rate", ylab = "True Positive Rate",
     type = "l", lwd = 1.5, family = "serif",
     cex.main = 1.15, cex.lab = 1, cex.axis = 1.05,
     font.axis = 2, font.lab = 2)

# The remaining curves are drawn on top of it
extra_curves <- list(perf_test, rf_perf_train, rf_perf_test)
for (i in seq_along(extra_curves)) {
  plot(extra_curves[[i]], add = TRUE, col = roc_colours[i + 1], lwd = 1.5)
}

legend(0.60, 0.25, roc_labels,
       col = roc_colours,
       bty = "n", lwd = 1.2, cex = 0.75)
abline(0, 1, lty = 2, col = "gray") # Add y=x line
par(old_par)
```

```{r auctraintest, message=FALSE, warning=FALSE}
# Area under the ROC curve of the logistic regression on each partition
auc.train <- pROC::auc(response = train$scam_type_grouped,
                       predictor = train_prob_pred)
cat("Area under the curve for Logistic Regression Train Set is: ", auc.train)

auc.test <- pROC::auc(response = test$scam_type_grouped,
                      predictor = test_prob_pred)
cat("\nArea under the curve for Logistic Regression Test Set is: ", auc.test)
```

```{r auctraintestrf, message=FALSE, warning=FALSE}
# Area under the ROC curve of the random forest on each partition, scored
# with the second column of the class-probability matrices
rf_train_score <- rf_train[, 2]
rf_test_score <- rf_test[, 2]

auc.train.rf <- pROC::auc(response = y_train, predictor = rf_train_score)
cat("Area under the curve for Random Forest Train Set is: ", auc.train.rf)

auc.test.rf <- pROC::auc(response = y_test, predictor = rf_test_score)
cat("\nArea under the curve for Random Forest Test Set is: ", auc.test.rf)
```