This is a somewhat whimsical use of machine learning: classify the 500 largest Indian cities by region, then use the fitted models to predict the locations of other cities.

The CSV file cities_r2.csv comes from https://www.kaggle.com/zed9941/datasets, a Kaggle dataset of information on the 500 largest Indian cities.

The goal is to use a machine learning algorithm to predict the region – North, South, East, West, Northeast, or Central – of an Indian city.

Data cleaning and feature engineering

To avoid being skewed by city size, we drop the raw population counts and re-express that information as rates. Then we bin the cities into six geographical regions, as laid out here: https://en.wikipedia.org/wiki/List_of_regions_of_India.

data <- read.csv("cities_r2.csv")
# feature engineering: express counts as rates; the "effective" rates exclude
# the 0-6 population, mirroring the dataset's effective literacy rate columns
data$young_pop_rate_total <- 100*data$X0.6_population_total/data$population_total
data$young_pop_rate_male <- 100*data$X0.6_population_male/data$population_male
data$young_pop_rate_female <- 100*data$X0.6_population_female/data$population_female
data$effective_grad_rate_total <- 100*data$total_graduates/(data$population_total-data$X0.6_population_total)
data$effective_grad_rate_male <- 100*data$male_graduates/(data$population_male-data$X0.6_population_male)
data$effective_grad_rate_female <- 100*data$female_graduates/(data$population_female-data$X0.6_population_female)

# subsetting: drop the state/district codes, the raw population, literate,
# and graduate counts, and location, keeping names, rates, and new features
data <- data[,-c(2,4,5:13,19:22)]

# define regions
Central <- c("CHHATTISGARH", "MADHYA PRADESH")
East <- c("BIHAR", "JHARKHAND", "WEST BENGAL", "ORISSA",
          "ANDAMAN & NICOBAR ISLANDS")
North <- c("CHANDIGARH", "HARYANA", "HIMACHAL PRADESH",
           "JAMMU & KASHMIR", "PUNJAB", "RAJASTHAN",
           "UTTARAKHAND", "UTTAR PRADESH", "NCT OF DELHI")
Northeast <- c("ASSAM", "MANIPUR ", "MEGHALAYA","MIZORAM",
               "NAGALAND","TRIPURA")
South <- c("ANDHRA PRADESH", "KARNATAKA", "KERALA", 
           "TAMIL NADU", "PUDUCHERRY")
West <- c("GUJARAT", "MAHARASHTRA")

# assign regions
data$region <- "X"
for (i in 1:nrow(data)){
      state <- as.character(data$state_name[i])
      if (state %in% North)
            data$region[i] <- "North"
      else if (state %in% East)
            data$region[i] <- "East"
      else if (state %in% South)
            data$region[i] <- "South"
      else if (state %in% West)
            data$region[i] <- "West"
      else if (state %in% Northeast)
            data$region[i] <- "Northeast"
      else if (state %in% Central)
            data$region[i] <- "Central"
}
data$region <- as.factor(data$region)
names(data)
##  [1] "name_of_city"                   "state_name"                    
##  [3] "sex_ratio"                      "child_sex_ratio"               
##  [5] "effective_literacy_rate_total"  "effective_literacy_rate_male"  
##  [7] "effective_literacy_rate_female" "young_pop_rate_total"          
##  [9] "young_pop_rate_male"            "young_pop_rate_female"         
## [11] "effective_grad_rate_total"      "effective_grad_rate_male"      
## [13] "effective_grad_rate_female"     "region"

Finally, we remove the city and state names; the state name would trivially give away the region.

d <- data
d$name_of_city <- NULL
d$state_name <- NULL

Machine learning models

Create the training and test subsets.

library(caret)
set.seed(134)
inTrain <- createDataPartition(d$region, p=0.80, list=FALSE)
training <- d[inTrain,]
test <- d[-inTrain,]
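
Since createDataPartition samples within each level of region, the training set should preserve the regional mix of the full data. A quick sanity check (counts not shown):

# region counts in the training set; the split is stratified on region
table(training$region)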

First I’ll do a simple classification tree.

fitTree <- train(region ~ ., data = training, method="rpart")
predTree <- predict(fitTree,test)
confMfitTree <- confusionMatrix(predTree,test$region)
confMfitTree$overall[1]
##  Accuracy 
## 0.5157895
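
To see which features the tree actually splits on, the fitted rpart model can be plotted. One option is the rpart.plot package (assuming it's installed; plot not shown):

# visualize the splits of the final rpart model
library(rpart.plot)
rpart.plot(fitTree$finalModel)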

Now let’s try a random forest instead.

library(randomForest)
# randomForest infers classification from the factor response;
# method="class" is an rpart argument and would be silently ignored here
fitRF <- randomForest(region ~ ., data=training)
#fitRF <- train(region~., data=training, method="rf",prox=TRUE)
predRF <- predict(fitRF,test)
confMfitRF <- confusionMatrix(predRF, test$region)
confMfitRF$overall[1]
##  Accuracy 
## 0.7684211
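
Random forests also report variable importance, which hints at which rates carry the regional signal (output not shown):

# mean decrease in Gini impurity per predictor, plus the standard dot plot
importance(fitRF)
varImpPlot(fitRF)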

Next I could try k-nearest neighbors.

fitKNN <- train(region~., data=training, method="knn")
predKNN <- predict(fitKNN,test)
confMfitKNN <- confusionMatrix(predKNN, test$region)
confMfitKNN$overall[1]
##  Accuracy 
## 0.6421053
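
One caveat: KNN is distance-based, and sex_ratio lives in the hundreds while every other predictor is a percentage, so on the raw features sex_ratio dominates the distances. Standardizing first is a one-line change in caret (accuracy not re-run here):

# same model with predictors centered and scaled before distances are computed
fitKNNscaled <- train(region ~ ., data = training, method = "knn",
                      preProcess = c("center", "scale"))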

Now, what if I try some sort of boosting?

fitGBM <- train(region~., data=training, method="gbm",verbose=FALSE)
predGBM <- predict(fitGBM,test)
confMfitGBM <- confusionMatrix(predGBM,test$region)
confMfitGBM$overall[1]
##  Accuracy 
## 0.8421053
confMfitGBM$table
##            Reference
## Prediction  Central East North Northeast South West
##   Central         3    0     1         0     1    1
##   East            0   17     1         1     0    0
##   North           2    4    28         0     0    1
##   Northeast       0    0     0         0     0    0
##   South           2    0     0         0    20    0
##   West            1    0     0         0     0   12
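
One thing the table makes obvious: the model never predicts Northeast, presumably because so few of the 500 largest cities are northeastern. The per-class breakdown from caret confirms this (output not shown):

# per-region sensitivity (recall); Northeast comes out as 0
confMfitGBM$byClass[, "Sensitivity"]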

Predictions

Here’s a cute prediction. I can make a new “city” that is the average of all the other cities, and see what my models predict for it.

e <- d
e$region <- NULL
averageCity <- colMeans(e)
averageCity <- as.data.frame(averageCity)
averageCity
##                                averageCity
## sex_ratio                        930.29412
## child_sex_ratio                  902.33266
## effective_literacy_rate_total     85.13146
## effective_literacy_rate_male      89.92016
## effective_literacy_rate_female    79.96718
## young_pop_rate_total              10.73508
## young_pop_rate_male               10.88571
## young_pop_rate_female             10.58078
## effective_grad_rate_total         14.78626
## effective_grad_rate_male          16.75958
## effective_grad_rate_female        12.68991
predict(fitGBM,t(averageCity))
## [1] Central
## Levels: Central East North Northeast South West
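
For comparison, the same average city can be pushed through all four fitted models; a small sketch (the other three predictions not shown):

# run the "average" city through each model
avg <- as.data.frame(t(averageCity))
sapply(list(tree = fitTree, rf = fitRF, knn = fitKNN, gbm = fitGBM),
       function(m) as.character(predict(m, avg)))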

A fun next step would be to gather these same statistics for a city in another country and see what region it gets classified into. Maybe London belongs in South India, and Los Angeles in West India? A sketch of how that might look is below.
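
Every number in this hypothetical "London" row is an invented placeholder, not real census data; treat it as a template to fill in with looked-up figures:

# a made-up "London" -- all values are illustrative placeholders, not
# actual statistics; swap in real figures before drawing any conclusions
london <- data.frame(sex_ratio = 1040, child_sex_ratio = 1000,
                     effective_literacy_rate_total = 99,
                     effective_literacy_rate_male = 99,
                     effective_literacy_rate_female = 99,
                     young_pop_rate_total = 8,
                     young_pop_rate_male = 8,
                     young_pop_rate_female = 8,
                     effective_grad_rate_total = 40,
                     effective_grad_rate_male = 40,
                     effective_grad_rate_female = 40)
predict(fitGBM, london)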