# This code is written for the project work for BAISI5 (Computing for Data
# Science) at Indian Statistical Institute under the PGDBA course.
# The code is written by Rachit Tripathi, Keshav Sehgal,
# Bodhisattwa Prasad Majumder and Dattatreya Biswas.
#
# Purpose: read the cleaned train/test recipe files, build a per-cuisine
# term-share matrix from the training ingredients, score every test recipe
# against each cuisine and write the predicted cuisine per recipe to a CSV.
#------------------------------------------------------------------------------

# Including necessary libraries ----
library(jsonlite)
library(tm)
library(data.table)
library(Matrix)
library(caret)
library(SnowballC)
library(xgboost)
library(Ckmeans.1d.dp)
library('e1071')
library('SparseM')
library('tm')
library('party')

# Load data ----
# Make sure to download the files, rename them and place them in the current
# working directory. Before running this code, run cleaning_data1_final.py and
# cleaning_data2_final.py.
train_raw <- fromJSON("clean_train.json", flatten = TRUE)
submit_raw <- fromJSON("clean_test.json", flatten = TRUE)

# Pre-process the ingredients (basic) ----

# Normalise every ingredient string of every recipe: lower-case it, turn
# dashes into underscores (so e.g. "low-fat" survives as "low_fat"), then
# drop every character that is not a-z, 0-9, underscore or space.
clean_ingredients <- function(ingredients) {
  lapply(ingredients, function(x) {
    x <- tolower(x)
    x <- gsub("-", "_", x)
    gsub("[^a-z0-9_ ]", "", x)
  })
}

train_raw$ingredients <- clean_ingredients(train_raw$ingredients)
submit_raw$ingredients <- clean_ingredients(submit_raw$ingredients)

# Create a matrix of ingredients in both the TRAIN and SUBMIT set ----
c_train_ingredients <- Corpus(VectorSource(train_raw$ingredients))
c_train_ingredients <- tm_map(c_train_ingredients, stripWhitespace)
c_test_ingredients <- Corpus(VectorSource(submit_raw$ingredients))
c_test_ingredients <- tm_map(c_test_ingredients, stripWhitespace)

# Document-term matrix for the training data, stored as a dense data frame.
c_train_ingredientsDTM <- DocumentTermMatrix(c_train_ingredients)
remove(c_train_ingredients)
c_train_ingredientsDTM <- as.data.frame(as.matrix(c_train_ingredientsDTM))

# Document-term matrix for the test data, stored as a dense data frame.
c_test_ingredientsDTM <- DocumentTermMatrix(c_test_ingredients)
c_test_ingredientsDTM <- as.data.frame(as.matrix(c_test_ingredientsDTM))

# Probability matrix for the naive Bayes classifier ----
CuisineNames <- unique(train_raw$cuisine)
n_cuisines <- length(CuisineNames)

# Integer class label of every training recipe (position in CuisineNames).
# match() is used on the whole vector instead of a per-element pmatch() loop;
# the sizes are taken from the data instead of being hard-coded.
ct <- match(train_raw$cuisine, CuisineNames)
stopifnot(length(ct) == nrow(c_train_ingredientsDTM))

# Number of training recipes per cuisine and the resulting class prior.
Sum_cuisine <- tabulate(ct, nbins = n_cuisines)
Prob_cuisine <- Sum_cuisine / sum(Sum_cuisine)

# Term counts summed per cuisine: one row per cuisine (in CuisineNames order),
# one column per training term.
Bayes_sum_Matrix <- as.matrix(rowsum(c_train_ingredientsDTM, group = ct))
rownames(Bayes_sum_Matrix) <- NULL

# Make the matrix column stochastic: every column (term) is divided by its
# total count over all cuisines, so each column sums to 1.
ing_total_sum <- colSums(Bayes_sum_Matrix)
Bayes_Prob_Matrix <- sweep(Bayes_sum_Matrix, 2, ing_total_sum, "/")

# Clean test data (remove terms which are not in the training data) ----
# NOTE(review): the filtered test matrix is not read by the prediction step
# below, which works from submit_raw$ingredients directly.
both_interesect <- intersect(
  names(c_test_ingredientsDTM),
  names(c_train_ingredientsDTM)
)
c_test_ingredientsDTM <- c_test_ingredientsDTM[, both_interesect, drop = FALSE]

# Predict for test data using multiplication ----
# For each test recipe: keep its distinct ingredient strings that are known
# training terms, multiply their per-cuisine shares together and weight by the
# class prior (Bayes' theorem under the assumption of independent
# ingredients). The predicted class is the arg max; ties resolve to the first
# maximum. A recipe with no known term gets an empty product (1), i.e. it is
# scored by the prior alone.
# NOTE(review): no smoothing is applied, so a single zero share zeroes the
# whole product for that cuisine - confirm this is intended.
known_terms <- colnames(Bayes_Prob_Matrix)
n_submit <- length(submit_raw$ingredients)

predicted_cuisine_idx <- vapply(seq_len(n_submit), function(i) {
  ing_cons <- submit_raw$ingredients[[i]]
  ing_considered <- unique(ing_cons[ing_cons %in% known_terms])
  # drop = FALSE keeps a matrix even for zero or one matching term.
  term_probs <- Bayes_Prob_Matrix[, ing_considered, drop = FALSE]
  cuisine_probability <- vapply(
    seq_len(n_cuisines),
    function(j) prod(term_probs[j, ]) * Prob_cuisine[j],
    numeric(1)
  )
  which.max(cuisine_probability)
}, integer(1))

# Write the data to file ----
submission <- CuisineNames[predicted_cuisine_idx]
# NOTE(review): the id column is not given an explicit name here, so its CSV
# header is auto-generated - verify against the expected submission format.
submission_final <- data.frame(cbind(submit_raw$id, submission))
write.csv(
  submission_final,
  file = 'Submission_GM_cutoff10.csv',
  row.names = FALSE,
  quote = FALSE
)