# Zoo data set: k-means clustering and external cluster validation ----
#
# Reads "zoo.csv", clusters the rows with k-means, and compares the clusters
# with the known `type` column through a contingency table, a PCA plot,
# purity, a macro-averaged F1 score and normalized mutual information (NMI).

#result=na.omit(dd)

library(ggplot2)

# aricode supplies NMI(); install it on first use if it is missing.
if (!requireNamespace("aricode", quietly = TRUE)) {
  install.packages("aricode")
}
library(aricode)

# Load the data ----
zoo_data <- read.csv("zoo.csv", header = TRUE, sep = ",")
head(zoo_data)

# Build the feature table ----
# `type` (the ground truth) and `name` are excluded from clustering. `legs` is
# not a yes/no column (presumably a count -- confirm against the CSV), so it
# is held out of the binary recoding and re-attached unchanged afterwards.
# Columns are dropped by name with setdiff() so that a missing column cannot
# turn the selection into "drop everything", as `-which(...)` would.
held_out_cols <- c("type", "name", "legs")
binary_cols <- setdiff(names(zoo_data), held_out_cols)
feature_data <- zoo_data[, binary_cols, drop = FALSE]
feature_data[] <- lapply(feature_data, function(x) ifelse(x == "yes", 1, 0))
feature_data$legs <- zoo_data$legs

# k-means with 7 clusters ----
# The seed is fixed so the random initial centres are reproducible.
set.seed(123)
kmeans_result <- kmeans(feature_data, centers = 7)

# Keep the features untouched in `feature_data`; `clustering_data` is the
# annotated copy that carries the cluster assignments.
clustering_data <- feature_data
clustering_data$cluster <- kmeans_result$cluster

# Cross-tabulate true type against cluster ----
table(zoo_data$type, clustering_data$cluster)

# Same table with the rows reordered for readability.
contingency_table <- table(zoo_data$type, clustering_data$cluster)
type_order <- c(
  "insect", "invertebrate", "bird", "amphibian", "fish", "reptile", "mammal"
)
contingency_table[type_order, ]

# Visualize the clusters in PCA space ----
# The projection is computed from the features only; the cluster labels are
# used solely to colour the points.
pca <- prcomp(feature_data, scale. = TRUE)
pca_df <- data.frame(pca$x, cluster = factor(clustering_data$cluster))
ggplot(pca_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "Clustering of Zoo Data (PCA view)")

# Add the true type back onto the annotated data frame.
clustering_data$true_label <- zoo_data$type

# Purity ----
# Each cluster is credited with the size of its largest class.
contingency_table <- table(zoo_data$type, clustering_data$cluster)
purity <- sum(apply(contingency_table, 2, max)) / sum(contingency_table)
cat("Purity: ", purity, "\n")

# Precision, recall and F1 ----
# Every cluster is matched to its majority class (first one on ties):
#   tp = members of the cluster that belong to the majority class
#   fp = members of the cluster that belong to any other class
#   fn = members of the majority class that landed in other clusters
majority_class <- apply(contingency_table, 2, which.max)
tp <- apply(contingency_table, 2, max)
fp <- colSums(contingency_table) - tp
fn <- rowSums(contingency_table)[majority_class] - tp
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)

# Macro-average over clusters, then combine into a single F1 score.
avg_precision <- mean(precision, na.rm = TRUE)
avg_recall <- mean(recall, na.rm = TRUE)
f1_score <- 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall)
cat("F1 Score: ", f1_score, "\n")

# NMI between the true types and the 7-cluster solution ----
nmi <- NMI(zoo_data$type, as.integer(clustering_data$cluster))
cat("Normalized Mutual Information (NMI): ", nmi, "\n")

# Compare two different clusterings ----
true_labels <- as.factor(zoo_data$type) # Ground truth

# Both runs are fitted on `feature_data` only, so neither the true labels nor
# the cluster ids of an earlier run are fed back in as features.
# Clustering result A: 9 clusters.
kmeans_result <- kmeans(feature_data, centers = 9)
clustering_data$cluster <- kmeans_result$cluster
predicted_labels_A <- as.factor(clustering_data$cluster)

# Clustering result B: 6 clusters.
kmeans_result <- kmeans(feature_data, centers = 6)
clustering_data$cluster <- kmeans_result$cluster
predicted_labels_B <- as.factor(clustering_data$cluster)

# NMI of each clustering against the truth, and of the two against each other.
nmi_A <- NMI(as.integer(predicted_labels_A), true_labels)
nmi_B <- NMI(as.integer(predicted_labels_B), true_labels)
nmi_A_B <- NMI(as.integer(predicted_labels_A), as.integer(predicted_labels_B))

cat("NMI between Clustering A and True Labels: ", nmi_A, "\n")
cat("NMI between Clustering B and True Labels: ", nmi_B, "\n")
cat("NMI between Clustering A and Clustering B: ", nmi_A_B, "\n")