# This code was written as project work for BAISI5 (Computing for Data Science)
# at the Indian Statistical Institute under the PGDBA course. The code was written by Rachit Tripathi,
# Keshav Sehgal, Bodhisattwa Prasad Majumder and Dattatreya Biswas
#----------------------------------------------------------------------------------------------
# including necessary libraries
library(jsonlite)
library(tm)
library(data.table)
library(Matrix)
library(caret)
library(SnowballC)
library(xgboost)

library(Ckmeans.1d.dp)

library('e1071');
library('SparseM');
library('tm');
library('party')

# Input data: clean_train.json and clean_test.json must be present in the current
# working directory (download and rename the files accordingly).
# Run cleaning_data1_final.py and cleaning_data2_final.py before running this script.
# flatten = TRUE is handed to jsonlite when each file is parsed.
train_raw <- jsonlite::fromJSON(txt = "clean_train.json", flatten = TRUE)
submit_raw <- jsonlite::fromJSON(txt = "clean_test.json", flatten = TRUE)

#- Basic clean-up of the ingredient strings, applied identically to TRAIN and SUBMIT
# Normalise the ingredient strings of a single recipe:
#   1. lower-case every string,
#   2. turn each dash into an underscore (e.g. "low-fat" becomes "low_fat"),
#   3. delete every character that is not a-z, 0-9, underscore or space.
normalise_ingredients <- function(items) {
  lowered <- tolower(items)
  underscored <- gsub("-", "_", lowered)
  gsub("[^a-z0-9_ ]", "", underscored)
}

train_raw$ingredients <- lapply(train_raw$ingredients, FUN = normalise_ingredients)
submit_raw$ingredients <- lapply(submit_raw$ingredients, FUN = normalise_ingredients)

#- Build a dense term table (documents as rows, terms as columns) for TRAIN and SUBMIT
# Wrap a list of ingredient vectors in a tm corpus and strip extra whitespace.
make_ingredient_corpus <- function(recipes) {
  tm_map(Corpus(VectorSource(recipes)), stripWhitespace)
}

# Build the document-term matrix of a corpus and expand it into a data frame.
corpus_to_term_frame <- function(corpus) {
  as.data.frame(as.matrix(DocumentTermMatrix(corpus)))
}

c_train_ingredients <- make_ingredient_corpus(train_raw$ingredients)
c_test_ingredients <- make_ingredient_corpus(submit_raw$ingredients)

# Term table for the training data; the training corpus is dropped once the table exists.
c_train_ingredientsDTM <- corpus_to_term_frame(c_train_ingredients)
remove(c_train_ingredients)

# Term table for the test data (the test corpus object is kept, as before).
c_test_ingredientsDTM <- corpus_to_term_frame(c_test_ingredients)
#######################################################################################################
######### This part of the code builds the probability matrix for the naive Bayes classifier #########
# Distinct cuisine labels; a label's position in this vector is its integer class id.
CuisineNames <- unique(train_raw$cuisine)

# Class id of every training recipe, for however many recipes were loaded (no
# hard-coded row count). Every label is drawn from CuisineNames, so exact matching
# is sufficient.
ct <- match(train_raw$cuisine, CuisineNames)
# Column of ones: its per-cuisine sum is the number of recipes in that cuisine.
col_1 <- rep(1, length(ct))
rowsum_train <- rowSums(c_train_ingredientsDTM)

Train_cuisine_DTM <- cbind(ct, col_1, c_train_ingredientsDTM)

# Sum every column (recipe counter + term counts) within each cuisine in one pass.
# rowsum() orders its rows by group value, so row k belongs to CuisineNames[k],
# whatever the number of cuisines is.
Bayes_Prob_Matrix <- as.matrix(
  rowsum(Train_cuisine_DTM[, -1, drop = FALSE], group = ct, reorder = TRUE)
)
rownames(Bayes_Prob_Matrix) <- NULL # rows are addressed by class id only

# First column is the summed counter column: number of training recipes per cuisine.
Sum_cuisine <- Bayes_Prob_Matrix[, 1]
Prob_cuisine <- Sum_cuisine / sum(Sum_cuisine) # share of training recipes in each cuisine

# Drop the counter column so that only the term columns remain.
Bayes_Prob_Matrix <- Bayes_Prob_Matrix[, -1, drop = FALSE]
ing_total_sum <- colSums(Bayes_Prob_Matrix) # total count of each term over all cuisines
cuisine_ing_sum <- rowSums(Bayes_Prob_Matrix) # total term count within each cuisine

# Keep the raw counts, then divide every column by its total so each column sums to 1.
# Entry [k, j] is therefore the share of term j's occurrences that fall in cuisine k.
# NOTE(review): this is a per-term distribution over cuisines, not the usual
# naive Bayes likelihood of a term within a cuisine - confirm this is intended.
Bayes_sum_Matrix <- Bayes_Prob_Matrix
Bayes_Prob_Matrix <- sweep(Bayes_sum_Matrix, 2, ing_total_sum, "/")
############# Naive Bayes probability matrix created ##################################################

############# Cleaning test data (dropping terms that do not occur in the training data) #############
# Terms present in both term tables, in the order they appear in the test table.
both_interesect <- intersect(names(c_test_ingredientsDTM), names(c_train_ingredientsDTM))
# drop = FALSE keeps the result a data frame even when only one term is shared;
# without it a single shared column would collapse to a plain vector.
c_test_ingredientsDTM <- c_test_ingredientsDTM[, both_interesect, drop = FALSE]
############# Predicting for test data using multiplication ###########################################
# Score one recipe against every cuisine and return the class id (row index of
# prob_matrix) with the highest score.
#   ingredients : character vector holding the recipe's ingredient strings
#   prob_matrix : numeric matrix, one row per cuisine, columns named by term
#   prior       : numeric vector with one weight per row of prob_matrix
# The score of a cuisine is the product of its entries for the recipe's known
# ingredients, multiplied by the cuisine's prior weight (ingredients are treated
# as independent). Ingredients that are not column names of prob_matrix are
# ignored; a recipe with no known ingredient is scored by the prior alone,
# because the product over an empty set is 1.
# NOTE(review): ingredients are matched against the column names as whole
# strings - confirm the cleaning scripts emit ingredients in that form.
predict_cuisine_index <- function(ingredients, prob_matrix, prior) {
  known <- intersect(ingredients, colnames(prob_matrix))
  # drop = FALSE keeps a matrix even for zero or one known ingredient.
  term_probs <- prob_matrix[, known, drop = FALSE]
  scores <- vapply(
    seq_len(nrow(term_probs)),
    function(k) prod(term_probs[k, ]),
    numeric(1)
  ) * prior
  # argmax over cuisines; ties resolve to the first (lowest) class id
  which.max(unname(scores))
}

# Predicted class id for every test recipe. The loop bounds come from the data
# itself, so any number of test recipes and cuisines is handled.
# The result is stored in `t` because the code that writes the submission reads
# that name; base::t() stays callable, since R skips non-function objects when
# looking up a function.
t <- vapply(
  submit_raw$ingredients,
  predict_cuisine_index,
  integer(1),
  prob_matrix = Bayes_Prob_Matrix,
  prior = Prob_cuisine,
  USE.NAMES = FALSE
)
# Translate the predicted class ids back into cuisine names and save them next to the recipe ids.
submission <- CuisineNames[t]
# cbind() of the ids and the names yields a two-column matrix, which is then turned into a data frame.
# NOTE(review): the id column gets no name from cbind(), so the file header is
# expected to read "V1,submission" - confirm this is the header the consumer wants.
submission_final <- data.frame(cbind(submit_raw$id, submission))
write.csv(
  submission_final,
  file = "Submission_GM_cutoff10.csv",
  row.names = FALSE,
  quote = FALSE
)