# This code is part of the course project titled 'What's Cooking'
# for the 'Computing for Data Sciences' course carried out in the PGDBA
# program @ ISI, Kolkata, during fall 2015, under the supervision of
# Sourav Sen Gupta (sg.[firstname]@gmail.com).
# Members: Bodhisattwa Prasad Majumder (PGDBA 11)
#          Dattatreya Biswas           (PGDBA 13)
#          Keshav Sehgal              (PGDBA 20)
#          Rachit Tripathi            (PGDBA 33)
# Reference:
# https://www.kaggle.com/yilihome/whats-cooking/simple-xgboost-in-r/code

# Load packages ----
library(jsonlite)
library(tm)
library(data.table)
library(Matrix)
library(caret)
library(SnowballC)
library(xgboost)
library(Ckmeans.1d.dp)

# Load data files and flatten ----
train_raw <- fromJSON("train.json", flatten = TRUE)
submit_raw <- fromJSON("test.json", flatten = TRUE)

# Pre-process the ingredients (basic) ----
# Lower-case everything, turn dashes into underscores so compound names
# survive tokenisation (e.g. "low-fat" -> "low_fat"), then strip every
# character that is not [a-z0-9_ ].
clean_ingredients <- function(ingredients) {
  ingredients <- lapply(ingredients, tolower)
  ingredients <- lapply(ingredients, function(x) gsub("-", "_", x))
  lapply(ingredients, function(x) gsub("[^a-z0-9_ ]", "", x))
}
train_raw$ingredients <- clean_ingredients(train_raw$ingredients)
submit_raw$ingredients <- clean_ingredients(submit_raw$ingredients)
# (Optional) only if you want to include the longest ingredient as a feature:
# train_raw$ingredients <- lapply(train_raw$ingredients,
#                                 function(x) gsub("[^a-zA-Z ]", ",", x))
# submit_raw$ingredients <- lapply(submit_raw$ingredients,
#                                  function(x) gsub("[^a-zA-Z ]", ",", x))

# Pre-process the ingredients (advanced) ----
# 1. Word stemming.
# BUG FIX: the original called wordStem() directly on the whole list
# (it expects a character vector) and stemmed only the TRAIN set,
# which would make the TRAIN and SUBMIT vocabularies disagree.
train_raw$ingredients <- lapply(train_raw$ingredients, wordStem,
                                language = "english")
submit_raw$ingredients <- lapply(submit_raw$ingredients, wordStem,
                                 language = "english")

# 2. Length of the longest ingredient name in each recipe (optional feature).
# BUG FIX: the original pre-allocated with c(0, rep(nrow(...))), which is
# the length-2 vector c(0, n) rather than n zeros, and then assigned a
# *list* element into the numeric column inside the loop. A vectorised
# vapply() over recipes replaces both loops.
train_raw$longest <- vapply(train_raw$ingredients,
                            function(x) max(nchar(x)), numeric(1))
submit_raw$longest <- vapply(submit_raw$ingredients,
                             function(x) max(nchar(x)), numeric(1))

# String distance de-duplication (work in progress, kept for reference):
# lv <- stringdistmatrix(submit_raw$ingredients, submit_raw$ingredients,
#                        method = "lv")
# lv_match <- which(lv == 1, arr.ind = TRUE)
# for (i in seq_along(lv_match) / 2) {
#   print(ingredients_unique[c(lv_match[i, 1], lv_match[i, 2])])
# }

# Build one corpus over both the TRAIN and SUBMIT sets ----
c_ingredients <- c(Corpus(VectorSource(train_raw$ingredients)),
                   Corpus(VectorSource(submit_raw$ingredients)))

# Simple document-term matrix; drop terms occurring in fewer than
# 3 documents, then convert to a plain data frame.
c_ingredientsDTM <- DocumentTermMatrix(c_ingredients)
c_ingredientsDTM <- removeSparseTerms(c_ingredientsDTM,
                                      1 - 3 / nrow(c_ingredientsDTM))
c_ingredientsDTM <- as.data.frame(as.matrix(c_ingredientsDTM))

# (Optional) remove very common ingredients that do not distinguish
# cuisines (e.g. salt, water):
# c_ingredientsDTM$salt <- NULL
# c_ingredientsDTM$water <- NULL
# (Optional) weighted DTM (e.g. TF-IDF):
# c_ingredientsDTM <- DocumentTermMatrix(
#   c_ingredients,
#   control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))

# Feature engineering (basic): number of ingredients per recipe ----
c_ingredientsDTM$ingredients_count <- rowSums(c_ingredientsDTM)

# Add cuisine for the TRAIN set; default the SUBMIT rows to "italian",
# the most frequent class (~20% of the training data).
c_ingredientsDTM$cuisine <- as.factor(c(train_raw$cuisine,
                                        rep("italian", nrow(submit_raw))))
# (Optional) longest-ingredient length as a feature:
# c_ingredientsDTM$longest <- as.numeric(c(train_raw$longest,
#                                          submit_raw$longest))

# Split the DTM back into TRAIN and SUBMIT sets ----
dtm_train <- c_ingredientsDTM[seq_len(nrow(train_raw)), ]
dtm_submit <- c_ingredientsDTM[-seq_len(nrow(train_raw)), ]

# Model: xgboost ----
# Prepare the sparse matrix (note: xgboost class labels start from 0).
feature_cols <- !colnames(dtm_train) %in% "cuisine"
xgbmat <- xgb.DMatrix(Matrix(data.matrix(dtm_train[, feature_cols])),
                      label = as.numeric(dtm_train$cuisine) - 1)

# Multiclass classification with the "softmax" objective.
# alpha = 1 adds L1 regularisation (alpha = 2 would give L2).
params <- list(alpha = 1)
# BUG FIX: `params` must be passed by name; positionally it was matched
# to the wrong formal argument of xgboost().
xgb <- xgboost(xgbmat, max.depth = 25, eta = 0.3, nrounds = 10,
               objective = "multi:softmax", num_class = 20,
               params = params)

# (Optional) multiclass model with "softprob" instead:
# xgb <- xgboost(xgbmat, max.depth = 25, eta = 0.3, nrounds = 500,
#                objective = "multi:softprob", num_class = 20,
#                params = params)

# Predict on the SUBMIT set; map the 0-based class index back to the
# cuisine name via the factor levels.
xgb.submit <- predict(xgb,
                      newdata = data.matrix(dtm_submit[, feature_cols]))
xgb.submit.text <- levels(dtm_train$cuisine)[xgb.submit + 1]
# Probability matrix when using multi:softprob:
# xgb_probabilities <- matrix(xgb.submit, 200, byrow = TRUE)

# Build and write the submission file ----
submit_match <- data.table(id = submit_raw$id,
                           cuisine = xgb.submit.text,
                           key = "id")
# NOTE(review): the original re-ordered predictions against `sample_sub`,
# but `sample_sub` is never loaded anywhere in this script, so those lines
# errored at runtime and their results were unused. They are kept below,
# commented out, until the sample submission is actually read in:
# submit_cuisine <- submit_match[id == sample_sub$id, as.matrix(cuisine)]
# submission <- data.frame(c(id = sample_sub$id, cuisine = submit_cuisine))
write.csv(submit_match, file = "xgboost_multiclass.csv",
          row.names = FALSE, quote = FALSE)

# Importance matrix and plot for the top 20 features ----
# BUG FIX: the original passed base R's `names` *function* as the
# feature-name argument; use the actual training feature names.
importance_matrix <- xgb.importance(colnames(dtm_train)[feature_cols],
                                    model = xgb)
xgb.plot.importance(importance_matrix[1:20, ])