# This code is part of the course project tiled, 'Topic Transition Modeling'
# for 'Computing for Data Sciences' course carried out in PGDBA program @ ISI, Kolkata 
# during fall 2015, under the supervision of Sourav Sen Gupta (sg.[firstname]@gmail.com).

# Authors:
# Aashish Jhamtani
# Pradeep Mooda
# Somya Ranjan Sahu
# Shashank Kumar ([firstname].inquest[at][same mail server as above])

# loading the required libraries

library(twitteR)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(topicmodels)
library(lda)
library(MASS)
library(openNLP)
library(NLP)
library(R.utils)
library(msm)
library(stringdist)
library(igraph)
library(stats)
library(utils)
library(devtools)
devtools::install_github("hadley/stringr")

################## processing input comments and creating a corpus #####################

# reading the scraped comments dumped in .csv format
f <- "/Users/shashank/Documents/PGDBA/ISI\ 1st\ Sem/Computing\ for\ Data\ Science/Project/comments_edited.csv"
df_tmp <- read.csv(f)

# reading the list of hindi words available
h_dictionary=read.csv("/Users/shashank/Documents/PGDBA/ISI\ 1st\ Sem/Computing\ for\ Data\ Science/Project/HindiDictionary.csv")
h_dictionary=as.character(h_dictionary[,1])

chunk <- 5   # no. of comments clubbed together to represent one timestamp
df_comments <- data.frame(comment=NA)
i <- 0

# initializing a spelling correction lookup table used for saving all corrections for
# future reference and learning
corrected_lookup <- matrix(data = c("guys","guy"),ncol = 2)
corrected_lookup <- read.table("/Users/shashank/Documents/PGDBA/ISI\ 1st\ Sem/Computing\ for\ Data\ Science/Project/corrected_lookup.csv")

# segregating group of comments in one line and spell-checking each word using native mycorrect function
while (chunk*(i+1) <= nrow(df_tmp)) {
  c <- paste(df_tmp[((chunk*i)+1):(chunk*(i+1)),1], collapse = " ")
  c <- gsub("[[:punct:]]", " ",c)
  words <- tolower(unlist(strsplit(c, "\\s+")))
  words <- words[!words %in% h_dictionary]
  corrected_words <- c()
  for (w in words) {
    if (is.na(match(w,sorted_words))) {
      if (is.na(match(w,corrected_lookup[,1]))) {
        rep <- mycorrect(w)
        corrected_words <- c(corrected_words,rep)
        if (!(rep==w)) {
          corrected_lookup <- rbind(corrected_lookup,cbind(w,rep))
        } 
      } else {
        corrected_words <- c(corrected_words,corrected_lookup[match(corrected_lookup[,1],w),1])
      }
      
    } else {
      corrected_words <- c(corrected_words,w)
    }
  }
  
  df_comments <- rbind(df_comments,paste0(corrected_words, collapse = " "))
  i <- i+1
}
df_comments <- as.data.frame(df_comments[-1,])

# writing out list of incorrect words and their corrected forms for future reference and learning
write.table(as.data.frame(corrected_lookup),file = "/Users/shashank/Documents/PGDBA/ISI\ 1st\ Sem/Computing\ for\ Data\ Science/Project/corrected_lookup.csv",col.names = NA)

# parts-of-speech tagging and collecting nouns
str(df_comments)
content=df_comments[,1]
class(content)
x <- matrix(data="",nrow = length(content), ncol = 2)
colnames(x)=c("Content","Nouns+verbs+Adjectives")

Maxent_Sent_Token_Annotator(language = "en", probs = FALSE, model = NULL)
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
pos_tag_annotator <- Maxent_POS_Tag_Annotator()
class(content)
for(j in 1:length(content)) {
  trim <- function (a) gsub("^\\s+|\\s+$", "", a)
  s=trim(content[j])  
  if(nchar(s)==0 || is.na(s)) {
    next 
  }
  s=as.character(s)
  s <- paste(c(s), collapse = "")
  s <- as.String(s)
  a2 <- annotate(s, list(sent_token_annotator, word_token_annotator))
  a3 <- annotate(s, pos_tag_annotator, a2)
  a3w <- subset(a3, type == "word")
  tags <- sapply(a3w$features, `[[`, "POS")
  class(a3w)
  class(a3w$features)
  a3wdf=as.data.frame(a3w)
  a4w=cbind(a3wdf,tags)
  i=1
  x[j,1]=as.character(data[j,1])
  for(i in 1:nrow(a4w)) {
    if(a4w[i,6]=="NN"|a4w[i,6]=="NNS"|a4w[i,6]=="NNP"|a4w[i,6]=="NNPS"){
      x[j,2]=paste((x[j,2]),(s[a4w[i,3],a4w[i,4]]),sep=" ")
    }
  }
}


# creating a corpus from the pre-processed data frame and further post-processing
mach_corpus = Corpus(VectorSource(x[,2]))
myCorpus <- tm_map(mach_corpus,content_transformer(function(x) iconv(x, to='UTF-8-MAC', sub='byte')),
                   mc.cores=1)

removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, removeURL)
myCorpus <- tm_map(myCorpus, tolower)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, PlainTextDocument)

# creating document term matrix
myDtm <- DocumentTermMatrix(myCorpus, control = list(wordLengths=c(2,Inf), weighting=weightTf))
#myDtm_smart <- DocumentTermMatrix(myCorpus, control = list(wordLengths=c(2,Inf), weighting= function(x) weightSMART(x,spec = "Lnn")))

# tfidf weighing in dtm
myDtm_tfidf <- DocumentTermMatrix(myCorpus, control = list(wordLengths=c(2,Inf), weighting=weightTfIdf))
myDtm$i <- myDtm_tfidf$i
myDtm$j <- myDtm_tfidf$j
myDtm$v <- as.integer(trunc((myDtm_tfidf$v)*10^(ceiling(log10(1/min((myDtm_tfidf$v)))))))

########################################################################################


######################## lda and epistemic mapping of topics ##########################

dtm <- myDtm
rowTotals <- apply(dtm, 1, sum)
dtm2 <- dtm[rowTotals>0,]
dim(dtm2)

# running lda with 10 topics
dtm_LDA <- LDA(dtm2, 10)
# getting 10 most relevant keywords for each topic, just for a quick look
get_terms(dtm_LDA, 10) 

# dictionary-topic mapping based on similarity of relevant words in the reference topics
# and topics found by applying lda over the corpus of comments

# loading the distribution of words for each reference topic already created
topics_path <- "/Users/shashank/Documents/PGDBA/ISI\ 1st\ Sem/Computing\ for\ Data\ Science/Project/Topics/"
filenames <- list.files(topics_path,pattern="*.csv")
f_path <- paste0(topics_path,filenames,sep="")
dic <- list()
for (i in 1:length(filenames)) {
  dic[[i]] <- as.matrix(read.csv(f_path[i],sep = ","))
  cum_prob <- 0
  crt_len <- 0
  for (j in 1:nrow(dic[[i]])) {
    cum_prob = cum_prob + as.numeric(dic[[i]][j,3])
    if (cum_prob>=0.95) {
      crt_len <- j
      break
    }
  }
  if (crt_len!=0) {
    dic[[i]] <- as.matrix(read.csv(f_path[i],sep = ","))[1:crt_len,2:3]
  } else {
    dic[[i]] <- as.matrix(read.csv(f_path[i],sep = ","))[,2:3]
  }
  dic[[i]][,2] <- matrix(as.numeric(dic[[i]][,2])/sum(as.numeric(dic[[i]][,2])),ncol = 1)
  
}

dic_len=nrow(dic[[1]])
lda_topic_words <- list()
dtm_LDA_beta <- exp(dtm_LDA@beta)
for (i in 1:nrow(dtm_LDA@beta)) {
  tmp_beta <- sort(dtm_LDA_beta[i,],decreasing = TRUE)
  cum_prob <- 0
  crt_len <- 0
  
  for (j in 1:dic_len) {
    cum_prob <- cum_prob+tmp_beta[j]
    if (cum_prob>0.95) {
      crt_len <- j
      break
    }
  }
  if (crt_len!=0) {
    lda_topic_words[[i]] <- t(as.matrix(rbind(get_terms(dtm_LDA,crt_len),tmp_beta[1:crt_len])))
  } else {
    lda_topic_words[[i]] <- as.matrix(cbind(as.vector(get_terms(dtm_LDA,dic_len)),matrix(tmp_beta[1:dic_len],ncol = 1)))
  }
  lda_topic_words[[i]][,2] <- matrix(as.numeric(lda_topic_words[[i]][,2])/sum(as.numeric(lda_topic_words[[i]][,2])),ncol = 1)
}

# calculating the similarity of topics in the corpus of comments with every dictionary 
# topic based on the dot product of probability values for same words in the topics
smlr <- matrix(data = NA,ncol = length(dic),nrow = dtm_LDA@k)
topic_dic_index <- matrix(data = NA,ncol = 1,nrow = dtm_LDA@k)
for (i in 1:dtm_LDA@k) {
  for (j in 1:length(dic)) {
    common_lexicon <- unique(c(lda_topic_words[[i]][,1],dic[[j]][,1]))
    s <- 0
    for (w in common_lexicon) {
      if (!(is.na(match(w,lda_topic_words[[i]][,1]))) & !(is.na(match(w,dic[[j]][,1])))) {
        s <- s + as.numeric(lda_topic_words[[i]][(match(w,lda_topic_words[[i]][,1])),2])*as.numeric(dic[[j]][(match(w,dic[[j]][,1])),2])
      }
    }
    smlr[i,j] <- s
  }
  topic_dic_index[i,1] <- gsub("+\\s.csv","",filenames[which.max(smlr[i,])])
}

#######################################################################################


#################### msm based markov chain modeling ##################################

# creating the batch of data required by msm for markov chain modeling
n_indv <- 1
time_factor <- 1
epoch <- length(dtm_LDA@documents)
time_frame <- as.integer(trunc(epoch*time_factor))
topic_set <- seq(1,dtm_LDA@k,1)
df_msm <- data.frame(Ints=integer(),
                     Ints=integer(),
                     Ints=integer())

for (i in 1:n_indv) {
  df_time <- t(t(sort(sample(seq(1,epoch,1),size = time_frame,replace = FALSE))))
  df_topic <- c()
  for (j in df_time) {
    t_prob <- dtm_LDA@gamma[j,]
    #   df_topic <- c(df_topic,sample(topic_set,size = 1,prob = t_prob))
    df_topic <- c(df_topic,which.max(t_prob))
  }
  df_topic <- t(t(df_topic))
  df_tmp <- cbind((vector(length = time_frame)+i),df_time,df_topic)
  df_msm <- rbind(df_msm,df_tmp)
}

colnames(df_msm) <- c("id","time","state")


# creating initial qmatrix and transition probability matrix as required by msm
qmatrix <- mat.or.vec(length(topic_set),length(topic_set))+1
init_tp <- statetable.msm(state, id, data=df_msm)
init_tp <- init_tp/rowSums(init_tp) 
crudeinits.msm(state ~ time, id, data=df_msm, qmatrix=qmatrix)

# running msm for fitting DTMC using 'bfgs' optimizer and other parameters to esure
# convergence while mainting accuracy of the maximum likihood estimates for the 
# transition probability parameters

topic.msm <- msm( formula = state ~ time, subject = id, data = df_msm,qmatrix = qmatrix,gen.inits = TRUE,
                  obstype = 2,exacttimes = TRUE,control = list(fnscale = 2500,
                                                               reltol = 1e-16,ndeps=rep(1e-6, 90)))


# splitting the chain in lengths of 100 time units to study the time in-homogeneous
# nature of the markov chain under time-series model
pmatrix_list=list()
topic.msm_list=list()
pointer=1
i=1
for (j in seq(100,nrow(df_msm),100)) {
  topic.msm <- msm( formula = state ~ time, subject = id, data = df_msm[pointer:j,],qmatrix = qmatrix,gen.inits = TRUE,
                    obstype = 2,exacttimes = TRUE,method = "CG")
  topic.msm_list[[i]] = topic.msm
  pmatrix_list[[i]] = as.matrix(pmatrix.msm(topic.msm))
  pointer=j+1
  i=i+1
}

ptrans_mat =matrix(data = 0,nrow = length(pmatrix_list),ncol = length(pmatrix_list[[1]]) )
for (i in (1:length(pmatrix_list))) {
  ptrans_mat[i,] = c(pmatrix_list[[i]])
}

# plotting the distribution of a few transition probability numbers over time
qplot(seq(1,64,1),ptrans_mat[,1],main="Distribution of p(1,1) over time", xlab="Time Window",ylab="p(1,1)")
qplot(seq(1,64,1),ptrans_mat[,10],main="Distribution of p(10,1) over time", xlab="Time Window",ylab="p(10,1)")
qplot(seq(1,64,1),ptrans_mat[,91],main="Distribution of p(1,10) over time", xlab="Time Window",ylab="p(1,10)")
qplot(seq(1,64,1),ptrans_mat[,100],main="Distribution of p(10,10) over time", xlab="Time Window",ylab="p(10,10)")

#######################################################################################


################## visualization in igraph ######################################################

test_nodes <- read.csv("G:\\PGDBA\\ISI\\Computing for Data Science\\CDS Project\\graphs\\nodes.csv", header=F, as.is=T)
test_links <- read.csv("G:\\PGDBA\\ISI\\Computing for Data Science\\CDS Project\\graphs\\links.csv", header=F, as.is=T)
transition <- read.csv("G:\\PGDBA\\ISI\\Computing for Data Science\\CDS Project\\graphs\\transition_matrix.csv", header=T, as.is=T)
transition <- transition[,2:length(transition)]
trans_mat <- as.matrix(transition)

trans=c()
for(i in 1:length(transition)){
  trans = append(trans,as.matrix(transition)[i,])
}

x_coor=c(2,2,4,4)
y_coor=c(1,4,1,4)
l=as.matrix(cbind(x_coor,y_coor))

state1=t(as.matrix(test_nodes[,2]))
step=10
docs=4
png(file="G:\\PGDBA\\ISI\\Computing for Data Science\\CDS Project\\graphs\\example%03d.png", width=1600,height=900)

for(i in 1:docs){  
  state2=state1%*%trans_mat
  for(j in 0:step){  
    temp_nodes=state1+(state2-state1)*j/step
    temp_nodes=t(temp_nodes)
    temp_nodes=as.data.frame(cbind(c("","T2","T3","T4"),temp_nodes))
    rownames(temp_nodes)=NULL
    temp_nodes$V2=as.numeric(levels(temp_nodes$V2))[temp_nodes$V2]
    test_net <- graph.data.frame(test_links, temp_nodes, directed=T)
    #test_net
    #V(test_net)$name
    #V(test_net)$V2
    #to remove links to self
    #plot(test_net)
    #plot(test_net, edge.arrow.size=.4)
    
    
    #colrs <- c("Red", "Green", "Blue", "Yellow")
    
    color=c(rgb((1-round(100*V(test_net)$V2[1])/100),(1-round(100*V(test_net)$V2[1])/100),(round(100*V(test_net)$V2[1])/100)),
            rgb((1-round(100*V(test_net)$V2[2])/100),(1-round(100*V(test_net)$V2[2])/100),(round(100*V(test_net)$V2[2])/100)),
            rgb((1-round(100*V(test_net)$V2[3])/100),(1-round(100*V(test_net)$V2[3])/100),(round(100*V(test_net)$V2[3])/100)),
            rgb((1-round(100*V(test_net)$V2[4])/100),(1-round(100*V(test_net)$V2[4])/100),(round(100*V(test_net)$V2[4])/100)))
    
    
    # plot(test_net,vertex.size=200*V(test_net)$V2,edge.arrow.size=.6,layout=l,vertex.color=color,
    #     vertex.label=as.character(V(test_net)$name),main=paste("time=",as.character(i+j/step)))
    
    test_net <- simplify(test_net, remove.multiple = F, remove.loops = T)
    
    plot(test_net,vertex.size=200*V(test_net)$V2,vertex.label=as.character(V(test_net)$name),
         edge.arrow.size=.6,layout=l,vertex.color=color,main=paste("time=",as.character(i+j/step)))
  }
  state1=state2
  rm(temp_nodes)
}

########################################################################################


####################### native modules #################################################

# java based crawler for web scraping

package myGov;

import java.io.*;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import javax.net.ssl.HttpsURLConnection;
import javax.swing.text.Document;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;

import org.jsoup.Jsoup;
import org.jsoup.select.Elements;

public class Scrap {
  public static void main(String[] args) throws Exception, IOException {
    int file_count = 83;
    BufferedWriter bw1 = new BufferedWriter(new FileWriter(
      "G:/EclipseWorkspace/scrapping/2_Bhopal_output.csv"));
    int v = 4383;
    int count = 0;
    BufferedWriter bw = new BufferedWriter(new FileWriter(
      "G:/EclipseWorkspace/scrapping/Bhopal_output_" + file_count
      + ".csv"));
    do {
      // define baseurl here
      String govbase = "https://mygov.in/group-issue/smart-city-bhopal/?field_hashtags_tid=&sort_by=created&sort_order=DESC&page=0%2C";
      String govlink = govbase + v;
      System.out.println(v);
      // / the below code will take all links from discussions page
      
      String govres = getUrlSource(govlink);
      // System.out.println(govres);
      org.jsoup.nodes.Document doc = Jsoup.parse(govres, "UTF-8");
      Elements discussionlink = doc.select("div.comment_body");
      // String f1 = discussionlink.toString();
      // System.out.println(f1);
      Elements c1 = discussionlink.select("p");
      String s1 = c1.toString();
      String comments[] = s1.split("</p>\n<p>");
      
      for (int i = 0; i < comments.length; i++) {
        comments[i] = comments[i].replaceAll("\\<[^>]*>", "");
        comments[i] = comments[i].replace(",", "");
        bw.write(comments[i] + " ");
        bw1.write(comments[i]);
        bw1.newLine();
        count++;
        if (count == 500) {
          Elements time = doc.select("span.date_time");
          String f2 = time.toString();
          String s2 = f2.replace("<span class=\"date_time\">", "")
          .replace("</span>", "");
          String times[] = s2.split("\n");
          bw.write("," + times[0]);
          bw.close();
          file_count++;
          count = 0;
          bw = new BufferedWriter(new FileWriter(
            "G:/EclipseWorkspace/scrapping/Bhopal_output_"
            + file_count + ".csv"));
        }
      }
      v++;
    } while (v <= 5000);
    bw1.close();
  }
  
  private static String getUrlSource(String govlink) throws IOException {
    // System.out.println("inside geturlsource");
    URL userurl = new URL(govlink);
    HttpsURLConnection input= (HttpsURLConnection) userurl
    .openConnection();
    BufferedReader in = new BufferedReader(new InputStreamReader(
      input.getInputStream(), "UTF-8"));
    String inputLine;
    StringBuilder a = new StringBuilder();
    while ((inputLine = in.readLine()) != null)
      a.append(inputLine);
    in.close();
    
    return a.toString();
  }
  
}

# aspell based spell_checker

raw_data=read.csv("G:\\EclipseWorkspace\\scrapping\\comments.csv",header=FALSE)
nrow(raw_data)
doc=500
comment=65
data=data.frame(Comment=NA)
for(i in 1:doc){
  print(i)
  raw_data2=paste(raw_data[(comment*(i-1)+1):(comment*i),1],collapse=" ")
  words <- unlist(strsplit(raw_data2, "\\s+"))
  words <- words[!words %in% dictionary]
  for(c in 1:length(words)){
    asp=aspell(as.factor(words[c]))
    if(length(asp$Suggestion)==1){
      if(class(unlist(asp$Suggestion))!="NULL"){
        words[c]=unlist(asp$Suggestion)[1]
      }
    }
  }
  raw_data3=paste(words,collapse=" ")
  data=rbind(data,raw_data3) 
}


# spell_checker based on soundex encoding and full Damerau-Levenshtein distance

# read in big.txt, a 6.5 mb collection of different english texts.
raw_text <- paste(readLines("/Users/shashank/Documents/PGDBA/ISI\ 1st\ Sem/Computing\ for\ Data\ Science/Project/big.txt"), collapse = " ")
# make the text lowercase and split it up creating a huge vector of word tokens.
split_text <- strsplit(tolower(raw_text), "[^a-z]+")
# count the number of different type of words.
word_count <- table(split_text)
# sort the words and create an ordered vector with the most common type of words first.
sorted_words <- names(sort(word_count, decreasing = TRUE))

mycorrect <- function(word,dist=4) {
  # Calculate the edit distance between the word and all other words in sorted_words.
  soundex_dist <- stringdist(word, sorted_words,method = "soundex")
  soundex_filtered_words =sorted_words[which(soundex_dist==0)]
  dl_dist <- stringdist(word, soundex_filtered_words,method = "dl")
  # Calculate the minimum edit distance to find a word that exists in big.txt 
  # with a limit of two edits.
  min_edit_dist <- min(dl_dist, dist)
  # Generate a vector with all words with this minimum edit distance.
  # Since sorted_words is ordered from most common to least common, the resulting
  # vector will have the most common / probable match first.
  proposals_by_prob <- c(soundex_filtered_words[ dl_dist <= min_edit_dist])
  # In case proposals_by_prob would be empty we append the word to be corrected...
  proposals_by_prob <- c(proposals_by_prob, word)
  # ... and return the first / most probable word in the vector.
  proposals_by_prob[1]
}


################################## End ################################################