# This code is part of the course project titled 'What's Cooking'
# for the 'Computing for Data Sciences' course carried out in the PGDBA
# program @ ISI, Kolkata, during fall 2015, under the supervision of
# Sourav Sen Gupta (sg.[firstname]@gmail.com).
# Members: Bodhisattwa Prasad Majumder (PGDBA 11)
#          Dattatreya Biswas           (PGDBA 13)
#          Keshav Sehgal              (PGDBA 20)
#          Rachit Tripathi            (PGDBA 33)
# Reference:
# https://www.kaggle.com/yilihome/whats-cooking/simple-xgboost-in-r/code

# Load packages ----
library(jsonlite)
library(tm)
library(data.table)
library(Matrix)
library(caret)
library(SnowballC)
library(xgboost)
library(Ckmeans.1d.dp)

# Load data files and flatten ----
train_raw <- fromJSON("train.json", flatten = TRUE)
submit_raw <- fromJSON("test.json", flatten = TRUE)

# Pre-process the ingredients (basic) ----
# Lower-case everything, turn dashes into underscores so compound names
# survive tokenisation (e.g. "low-fat" -> "low_fat"), then strip every
# character that is not [a-z0-9_ ].
clean_ingredients <- function(ingredients) {
  ingredients <- lapply(ingredients, tolower)
  ingredients <- lapply(ingredients, function(x) gsub("-", "_", x))
  lapply(ingredients, function(x) gsub("[^a-z0-9_ ]", "", x))
}
train_raw$ingredients <- clean_ingredients(train_raw$ingredients)
submit_raw$ingredients <- clean_ingredients(submit_raw$ingredients)
# (Optional) only if you want to include the longest ingredient as a feature:
# train_raw$ingredients <- lapply(train_raw$ingredients,
#                                 function(x) gsub("[^a-zA-Z ]", ",", x))
# submit_raw$ingredients <- lapply(submit_raw$ingredients,
#                                  function(x) gsub("[^a-zA-Z ]", ",", x))

# Pre-process the ingredients (advanced) ----
# 1. Word stemming.
# BUG FIX: the original called wordStem() directly on the whole list
# (it expects a character vector) and stemmed only the TRAIN set,
# which would make the TRAIN and SUBMIT vocabularies disagree.
train_raw$ingredients <- lapply(train_raw$ingredients, wordStem,
                                language = "english")
submit_raw$ingredients <- lapply(submit_raw$ingredients, wordStem,
                                 language = "english")

# 2. Length of the longest ingredient name in each recipe (optional feature).
# BUG FIX: the original pre-allocated with c(0, rep(nrow(...))), which is
# the length-2 vector c(0, n) rather than n zeros, and then assigned a
# *list* element into the numeric column inside the loop. A vectorised
# vapply() over recipes replaces both loops.
train_raw$longest <- vapply(train_raw$ingredients,
                            function(x) max(nchar(x)), numeric(1))
submit_raw$longest <- vapply(submit_raw$ingredients,
                             function(x) max(nchar(x)), numeric(1))

# String distance de-duplication (work in progress, kept for reference):
# lv <- stringdistmatrix(submit_raw$ingredients, submit_raw$ingredients,
#                        method = "lv")
# lv_match <- which(lv == 1, arr.ind = TRUE)
# for (i in seq_along(lv_match) / 2) {
#   print(ingredients_unique[c(lv_match[i, 1], lv_match[i, 2])])
# }

# Build one corpus over both the TRAIN and SUBMIT sets ----
c_ingredients <- c(Corpus(VectorSource(train_raw$ingredients)),
                   Corpus(VectorSource(submit_raw$ingredients)))

# Simple document-term matrix; drop terms occurring in fewer than
# 3 documents, then convert to a plain data frame.
c_ingredientsDTM <- DocumentTermMatrix(c_ingredients)
c_ingredientsDTM <- removeSparseTerms(c_ingredientsDTM,
                                      1 - 3 / nrow(c_ingredientsDTM))
c_ingredientsDTM <- as.data.frame(as.matrix(c_ingredientsDTM))

# (Optional) remove very common ingredients that do not distinguish
# cuisines (e.g. salt, water):
# c_ingredientsDTM$salt <- NULL
# c_ingredientsDTM$water <- NULL
# (Optional) weighted DTM (e.g. TF-IDF):
# c_ingredientsDTM <- DocumentTermMatrix(
#   c_ingredients,
#   control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))

# Feature engineering (basic): number of ingredients per recipe ----
c_ingredientsDTM$ingredients_count <- rowSums(c_ingredientsDTM)

# Add cuisine for the TRAIN set; default the SUBMIT rows to "italian",
# the most frequent class (~20% of the training data).
c_ingredientsDTM$cuisine <- as.factor(c(train_raw$cuisine,
                                        rep("italian", nrow(submit_raw))))
# (Optional) longest-ingredient length as a feature:
# c_ingredientsDTM$longest <- as.numeric(c(train_raw$longest,
#                                          submit_raw$longest))

# Split the DTM back into TRAIN and SUBMIT sets ----
dtm_train <- c_ingredientsDTM[seq_len(nrow(train_raw)), ]
dtm_submit <- c_ingredientsDTM[-seq_len(nrow(train_raw)), ]

# Model: xgboost ----
# Prepare the sparse matrix (note: xgboost class labels start from 0).
feature_cols <- !colnames(dtm_train) %in% "cuisine"
xgbmat <- xgb.DMatrix(Matrix(data.matrix(dtm_train[, feature_cols])),
                      label = as.numeric(dtm_train$cuisine) - 1)

# Multiclass classification with the "softmax" objective.
# alpha = 1 adds L1 regularisation (alpha = 2 would give L2).
params <- list(alpha = 1)
# BUG FIX: `params` must be passed by name; positionally it was matched
# to the wrong formal argument of xgboost().
xgb <- xgboost(xgbmat, max.depth = 25, eta = 0.3, nrounds = 10,
               objective = "multi:softmax", num_class = 20,
               params = params)

# (Optional) multiclass model with "softprob" instead:
# xgb <- xgboost(xgbmat, max.depth = 25, eta = 0.3, nrounds = 500,
#                objective = "multi:softprob", num_class = 20,
#                params = params)

# Predict on the SUBMIT set; map the 0-based class index back to the
# cuisine name via the factor levels.
xgb.submit <- predict(xgb,
                      newdata = data.matrix(dtm_submit[, feature_cols]))
xgb.submit.text <- levels(dtm_train$cuisine)[xgb.submit + 1]
# Probability matrix when using multi:softprob:
# xgb_probabilities <- matrix(xgb.submit, 200, byrow = TRUE)

# Build and write the submission file ----
submit_match <- data.table(id = submit_raw$id,
                           cuisine = xgb.submit.text,
                           key = "id")
# NOTE(review): the original re-ordered predictions against `sample_sub`,
# but `sample_sub` is never loaded anywhere in this script, so those lines
# errored at runtime and their results were unused. They are kept below,
# commented out, until the sample submission is actually read in:
# submit_cuisine <- submit_match[id == sample_sub$id, as.matrix(cuisine)]
# submission <- data.frame(c(id = sample_sub$id, cuisine = submit_cuisine))
write.csv(submit_match, file = "xgboost_multiclass.csv",
          row.names = FALSE, quote = FALSE)

# Importance matrix and plot for the top 20 features ----
# BUG FIX: the original passed base R's `names` *function* as the
# feature-name argument; use the actual training feature names.
importance_matrix <- xgb.importance(colnames(dtm_train)[feature_cols],
                                    model = xgb)
xgb.plot.importance(importance_matrix[1:20, ])