# This code is part of the course project titled 'What's Cooking'
# for the 'Computing for Data Sciences' course carried out in the PGDBA program
# @ ISI, Kolkata during fall 2015, under the supervision of
# Sourav Sen Gupta (sg.[firstname]@gmail.com).
# Members: Bodhisattwa Prasad Majumder (PGDBA 11)
#          Dattatreya Biswas (PGDBA 13)
#          Keshav Sehgal (PGDBA 20)
#          Rachit Tripathi (PGDBA 33)
# Reference for the code:
# https://www.kaggle.com/yilihome/whats-cooking/simple-xgboost-in-r/code
#
# Purpose: build a bag-of-words document-term matrix from the `Ingredients`
# column of "Indian cuisine data.csv", train a multi-class XGBoost model that
# predicts the `State` column, and report hold-out accuracy plus the most
# important features.

# Load packages ----
library(jsonlite)
library(tm)
library(data.table)
library(Matrix)
library(caret)
library(SnowballC)
library(xgboost)
library(Ckmeans.1d.dp)

# Load data files ----
Dish <- read.csv(file = "Indian cuisine data.csv", header = TRUE, sep = ",")

# Convert all the words to lowercase. tolower() is vectorized, so the column
# stays a plain character vector instead of becoming a list column.
Dish$Ingredients <- tolower(as.character(Dish$Ingredients))

# Corpus creation ----
c1_ingredients <- Corpus(VectorSource(Dish$Ingredients))

# Create a simple DTM (one row per dish, one column per term)
c1_ingredientsDTM <- DocumentTermMatrix(c1_ingredients)
c1_ingredientsDTM <- as.data.frame(as.matrix(c1_ingredientsDTM))

# Random hold-out sample of n rows for testing; the rest is used for training.
# NOTE(review): call set.seed() before this line if a reproducible split is
# wanted.
n <- 200
stopifnot(nrow(Dish) > n)
rownum <- sample(nrow(Dish), n)

# Feature addition 1: total term count of each DTM row, used as a proxy for
# the number of ingredients in the dish.
c1_ingredientsDTM$ingredients_count <- rowSums(c1_ingredientsDTM)

# Feature addition 2 (optional, disabled): number of spices in the dish.
# Requires library(stringr) for str_count(), which is not loaded above.
#
# spices <- c(
#   "alkanet root", "ani", "asafoetida", "red chilli", "black cardamom",
#   "white pepper", "black pepper", "peppercorns", "black cumin", "capers",
#   "capsicum", "celery", "charoli", "bay leaf", "cinnamon buds", "cinnamon",
#   "citric acid", "cloves", "coriander powder", "coriander ", "cubeb",
#   "cumin ", "fennel ", "fenugreek leaf", "fenugreek leaf", "fenugreek ",
#   "four ", "garcinia gummi-gutta", "garam masala", "garlic", "ginger",
#   "dried ginger", "green cardamom", "indian bedellium tree",
#   "indian gooseberry", "black salt", "kalpasi", "licorice powder",
#   "long pepper", "mango extract", "mint", "mustard ", "brown mustard ",
#   "nigella ", "nutmeg", "mace", "holy basil", "panch phoron",
#   "pomegranate ", "poppy ", "saffron pulp", "saffron", "salt", "sesame ",
#   "star anise", "tamarind", "turmeric", "fresh basil", "fresh coriander",
#   "green chili pepper", "gum tragacanth", "red chili pepper", "fennel ",
#   "asafoetida", "basil ", "bay leaf", "big mustard ", "black cardamom",
#   "black cumin ", "black mustard ", "black pepper", "black salt",
#   "caraway ", "carom", "thymol", "oregano", "bishops weed",
#   "chilli powder", "cinnamon", "clarified butter", "cloves",
#   "coconut desiccated", "coconut", "coconut fresh", "coriander powder",
#   "coriander ", "cilantro", "coriander ", "cumin ", "curry ",
#   " fenugreek ", " ginger", " mango powder", "fenugreek ", "fenugreek",
#   "garlic", "fresh ginger", " ginger", "green cardamom", "green chilli",
#   "ground nut, peanuts", "holy basil", "jaggery", "palm jaggery", "lime",
#   "cocum, kokum", "lime peel", "mace", "mint", "nutmeg", "onion",
#   "nigella", "pine nuts", "pomegranate ", "poppy ", "raisins",
#   "red chilli", "rock salt", "rock flower", "stone flower", "salt",
#   "sesame ", "saffron", "pepper", "szechwan pepper", "star anise",
#   "sultanas", "tamarind", "turmeric", "vetiver", "vinegar"
# )
# train_spices <- vapply(
#   Dish$Ingredients,
#   FUN = function(x) sum(str_count(x, spices)),
#   FUN.VALUE = numeric(1)
# )
# # Smaller experiment kept from the original script:
# # train_spices <- sapply(Dish$Ingredients,
# #   FUN = function(x) sum(str_count(x, c("milk", "coconut"))))
# c1_ingredientsDTM$spices_count <- train_spices

# Target label
c1_ingredientsDTM$cuisine <- as.factor(Dish$State)

# Partition the dataset into train and test (named submit) ----
dtm_train <- c1_ingredientsDTM[-rownum, ]
dtm_submit <- c1_ingredientsDTM[rownum, ]

# Placeholder label for the hold-out rows ("North India" has the highest
# frequency). The model never sees this column; it is dropped before predict.
dtm_submit$cuisine <- rep("North India", n)

# Model creation: XGBoost ----
# Every column except the label is a feature. The names are kept so they can
# be handed to xgb.importance() below.
is_feature <- !colnames(dtm_train) %in% c("cuisine")
feature_names <- colnames(dtm_train)[is_feature]

# sparse = TRUE guarantees a dgCMatrix, which xgb.DMatrix accepts; without it
# Matrix() may choose a dense representation.
xgbmat <- xgb.DMatrix(
  Matrix(data.matrix(dtm_train[, is_feature]), sparse = TRUE),
  label = as.numeric(dtm_train$cuisine) - 1
)

# num_class is derived from the label's factor levels rather than hard-coded,
# so the script keeps working if the set of states in the data changes.
xgb <- xgboost(
  xgbmat,
  max.depth = 25,
  eta = 0.1,
  nrounds = 200,
  objective = "multi:softmax",
  num_class = nlevels(dtm_train$cuisine)
)

# Predict on the test set and change cuisine back to string ----
xgb.submit <- predict(
  xgb,
  newdata = data.matrix(dtm_submit[, !colnames(dtm_submit) %in% c("cuisine")])
)
# multi:softmax returns 0-based class ids; shift by one to index the levels.
xgb.submit.text <- levels(dtm_train$cuisine)[xgb.submit + 1]

# Accuracy test ----
# Compare as character so differing factor level sets cannot cause an error.
final <- data.frame(
  xgb.submit.text = xgb.submit.text,
  actual = as.character(Dish$State[rownum]),
  stringsAsFactors = FALSE
)
# 1 when prediction equals the true state, 0 otherwise (NA counts as a miss).
final$match <- as.numeric(
  !is.na(final$actual) & final$xgb.submit.text == final$actual
)
Accuracy <- sum(final$match, na.rm = TRUE) / n
print(Accuracy)

# Importance matrix and plot for the top 20 features ----
importance_matrix <- xgb.importance(feature_names = feature_names, model = xgb)
# head() avoids NA rows when the model used fewer than 20 features.
xgb.plot.importance(head(importance_matrix, 20))