#result=na.omit(dd)

# Load the zoo dataset from "zoo.csv" in the working directory.
# read.csv() already defaults to a header row and a comma separator.
# Later steps expect the columns "type", "name" and "legs" to be present.
zoo_data <- read.csv(file = "zoo.csv")

# Preview the first rows to check that the file was parsed as expected.
head(zoo_data, n = 6L)

# Build the feature table used for clustering.
# "type" (the label later compared against the clusters) and "name" are left
# out entirely; "legs" is set aside so that it is not passed through the
# yes/no recoding and is re-attached unchanged afterwards.
# Columns are selected by name with setdiff(): the `-which(names(df) == "x")`
# idiom silently returns a data frame with zero columns when "x" is absent.
flag_columns <- setdiff(names(zoo_data), c("type", "name", "legs"))
clustering_data <- zoo_data[, flag_columns, drop = FALSE]

# Recode every remaining column: "yes" becomes 1, any other non-missing
# value becomes 0.
clustering_data[] <- lapply(
  clustering_data,
  function(x) ifelse(x == "yes", 1, 0)
)

# Re-attach the leg count with its original values.
clustering_data$legs <- zoo_data$legs

# Run k-means with 7 clusters, the same number of groups as the animal types
# listed in the contingency table below.
# The seed is fixed first so that the random starting centres, and therefore
# the resulting partition, are reproducible between runs.
set.seed(123)
kmeans_result <- kmeans(x = clustering_data, centers = 7)

# Store each row's cluster id next to its features.
clustering_data[["cluster"]] <- kmeans_result$cluster

# Cross-tabulate the true animal type (rows) against the assigned cluster
# (columns) to see how well the clusters line up with the known labels.
contingency_table <- table(zoo_data$type, clustering_data$cluster)
contingency_table

# Show the same table again with the rows in a fixed, easier-to-read order.
# Only labels that actually occur are requested, so a type missing from the
# data no longer triggers a "subscript out of bounds" error; labels not in
# the preferred list are appended at the end instead of being dropped.
preferred_order <- c("insect", "invertebrate", "bird", "amphibian",
                     "fish", "reptile", "mammal")
present_types <- rownames(contingency_table)
row_order <- c(intersect(preferred_order, present_types),
               setdiff(present_types, preferred_order))
contingency_table[row_order, , drop = FALSE]

# Visualise the clusters in two dimensions using PCA.
library(ggplot2)

# The "cluster" column is excluded from the PCA input: it holds the k-means
# assignment, which is an arbitrary label rather than a measured feature, and
# including it would distort the projection towards the cluster ids.
feature_columns <- setdiff(names(clustering_data), "cluster")
pca <- prcomp(clustering_data[, feature_columns, drop = FALSE], scale. = TRUE)

# Pair the principal-component scores with the cluster id (as a factor, so
# ggplot2 uses a discrete colour scale).
pca_df <- data.frame(pca$x, cluster = factor(clustering_data$cluster))

ggplot(pca_df, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "Clustering of Zoo Data (PCA view)")

# Attach the original animal type to the clustered data so that each row
# carries both its assigned cluster and its true label.
clustering_data[["true_label"]] <- zoo_data$type