#result=na.omit(dd)

# Load the zoo dataset from the working directory.
# The code below relies on three named columns: 'name', 'type' and 'legs';
# every remaining column is treated as a "yes"/"no" trait.
# NOTE(review): the exact schema of zoo.csv is not visible here - confirm.
zoo_data <- read.csv("zoo.csv", header = TRUE, sep = ",")

head(zoo_data)

# Pick the trait columns by name. Name-based selection is used instead of
# `-which(names(...) == "x")` because a negative index built from an empty
# `which()` result selects *zero* columns when the column is missing.
non_trait_cols <- c("type", "name", "legs")
trait_cols <- setdiff(names(zoo_data), non_trait_cols)
clustering_data <- zoo_data[, trait_cols, drop = FALSE]

# Encode every trait column numerically: "yes" becomes 1, any other
# non-missing value becomes 0 (missing values stay NA).
clustering_data[] <- lapply(
  clustering_data,
  function(x) as.numeric(x == "yes")
)

# 'legs' is appended unchanged as the last feature column; it is kept out of
# the encoding step above so its values are not collapsed to 0/1.
clustering_data$legs <- zoo_data$legs

# Partition the observations into 7 clusters with k-means. The seed is fixed
# so that the random choice of starting centers is reproducible.
set.seed(123)
kmeans_result <- kmeans(clustering_data, centers = 7)

# Keep each row's cluster id next to its features.
clustering_data$cluster <- kmeans_result$cluster

# Cross-tabulate the 'type' column of the original data (rows) against the
# assigned cluster (columns) and print it.
contingency_table <- table(zoo_data$type, clustering_data$cluster)
contingency_table

# Print the same table again with the rows in a fixed, easier-to-read order.
type_order <- c(
  "insect", "invertebrate", "bird", "amphibian", "fish", "reptile", "mammal"
)
contingency_table[type_order, ]

# Visualize the clusters on the first two principal components.
library(ggplot2)

# Run PCA on the feature columns only. The bookkeeping columns ('cluster',
# and 'true_label' if it is already present) are excluded: feeding the
# cluster id into the PCA would leak the labels into the very projection
# that is supposed to show how well the features separate the clusters.
pca_cols <- setdiff(names(clustering_data), c("cluster", "true_label"))
pca <- prcomp(clustering_data[, pca_cols, drop = FALSE], scale. = TRUE)

# Principal-component scores for every row, plus the cluster id as a factor
# so ggplot maps it to a discrete color scale.
pca_df <- data.frame(pca$x, cluster = factor(clustering_data$cluster))

ggplot(pca_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "Clustering of Zoo Data (PCA view)")

# Store the ground-truth type next to the features and cluster ids.
clustering_data$true_label <- zoo_data$type

# Cross-tabulate the true type (rows) against the assigned cluster (columns).
contingency_table <- table(zoo_data$type, clustering_data$cluster)

# Purity: for each cluster take the count of its most frequent type, add
# those counts up, and divide by the total number of observations.
majority_counts <- apply(contingency_table, 2, max)
purity <- sum(majority_counts) / sum(contingency_table)

cat("Purity: ", purity, "\n")


# Macro-averaged F1 score of the clustering.
# Each cluster (column of `contingency_table`) is matched to its most
# frequent true type (row). For that pairing:
#   TP = members of the cluster that belong to the dominant type
#   FP = members of the cluster that belong to any other type
#   FN = members of the dominant type that landed in other clusters
cluster_sizes <- colSums(contingency_table)  # TP + FP for each cluster
class_sizes <- rowSums(contingency_table)    # size of each true type

tp <- apply(contingency_table, 2, max)
dominant_class <- apply(contingency_table, 2, which.max)

# Precision = TP / (TP + FP): the share of the cluster that is dominant type.
precision <- unname(tp / cluster_sizes)

# Recall = TP / (TP + FN): the share of the dominant type captured by the
# cluster. TP + FN is the row total of the dominant type, not the grand
# total of the table.
recall <- unname(tp / class_sizes[dominant_class])

# A cluster with no members has no meaningful dominant type; mark its recall
# as missing so it is skipped by the averages (its precision is already NaN).
recall[cluster_sizes == 0] <- NA

# Average precision and recall over clusters, ignoring undefined entries.
avg_precision <- mean(precision, na.rm = TRUE)
avg_recall <- mean(recall, na.rm = TRUE)

# F1 is the harmonic mean of the averaged precision and recall.
f1_score <- 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)

cat("F1 Score: ", f1_score, "\n")


# Normalized Mutual Information (NMI) between the true types and the cluster
# assignment, computed with the aricode package.
# The package is installed on the fly when it is not available yet.
aricode_installed <- requireNamespace("aricode", quietly = TRUE)
if (!aricode_installed) {
  install.packages("aricode")
}
library(aricode)

# Compare the 'type' column with the integer cluster ids.
cluster_ids <- as.integer(clustering_data$cluster)
nmi <- NMI(zoo_data$type, cluster_ids)

cat("Normalized Mutual Information (NMI): ", nmi, "\n")





# Compare two k-means clusterings (k = 9 and k = 6) with each other and with
# the ground-truth types, using NMI.
true_labels <- as.factor(zoo_data$type)          # Ground truth

# Strip the bookkeeping columns by name so that only the feature columns are
# clustered. Selecting by name (rather than by position 17/18) keeps this
# correct even if the number of feature columns changes.
feature_cols <- setdiff(names(clustering_data), c("cluster", "true_label"))
clustering_data <- clustering_data[, feature_cols, drop = FALSE]

# Clustering A: 9 clusters. The labels are kept in their own variable and
# are NOT written into `clustering_data` yet; otherwise they would become
# an extra input feature of clustering B.
kmeans_result_a <- kmeans(clustering_data, centers = 9)
predicted_labels_A <- as.factor(kmeans_result_a$cluster)  # Clustering result A

# Clustering B: 6 clusters, fitted on exactly the same features as A.
kmeans_result <- kmeans(clustering_data, centers = 6)

# Leave the most recent assignment (B) attached to the data.
clustering_data$cluster <- kmeans_result$cluster
predicted_labels_B <- as.factor(clustering_data$cluster)  # Clustering result B

# NMI between clustering result A and the true labels
nmi_A <- NMI(as.integer(predicted_labels_A), zoo_data$type)

# NMI between clustering result B and the true labels
nmi_B <- NMI(as.integer(predicted_labels_B), zoo_data$type)

# NMI between clustering results A and B
nmi_A_B <- NMI(as.integer(predicted_labels_A), as.integer(predicted_labels_B))

cat("NMI between Clustering A and True Labels: ", nmi_A, "\n")
cat("NMI between Clustering B and True Labels: ", nmi_B, "\n")
cat("NMI between Clustering A and Clustering B: ", nmi_A_B, "\n")