# This code is part of the course project titled 'Topic Transition Modeling'
# for the 'Computing for Data Sciences' course carried out in the PGDBA program @ ISI, Kolkata
# during fall 2015, under the supervision of Sourav Sen Gupta (sg.[firstname]@gmail.com).
# Authors:
#   Aashish Jhamtani
#   Pradeep Mooda
#   Somya Ranjan Sahu
#   Shashank Kumar ([firstname].inquest[at][same mail server as above])

# loading the required libraries
library(twitteR)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(topicmodels)
library(lda)
library(MASS)
library(openNLP)
library(NLP)
library(R.utils)
library(msm)
library(stringdist)
library(igraph)
library(ggplot2)   # qplot() is used in the msm section below
library(stats)
library(utils)
library(devtools)
devtools::install_github("hadley/stringr")

################## processing input comments and creating a corpus #####################

# reading the scraped comments dumped in .csv format
f <- "/Users/shashank/Documents/PGDBA/ISI 1st Sem/Computing for Data Science/Project/comments_edited.csv"
df_tmp <- read.csv(f)

# reading the list of Hindi words available
h_dictionary <- read.csv("/Users/shashank/Documents/PGDBA/ISI 1st Sem/Computing for Data Science/Project/HindiDictionary.csv")
h_dictionary <- as.character(h_dictionary[,1])

chunk <- 5   # no. of comments clubbed together to represent one timestamp
df_comments <- data.frame(comment = NA)
i <- 0

# initializing a spelling-correction lookup table used for saving all corrections for
# future reference and learning
corrected_lookup <- matrix(data = c("guys","guy"), ncol = 2)
corrected_lookup <- read.table("/Users/shashank/Documents/PGDBA/ISI 1st Sem/Computing for Data Science/Project/corrected_lookup.csv")

# grouping each chunk of comments into one line and spell-checking every word with the
# native mycorrect() function (defined, together with sorted_words, in the 'native modules'
# section at the end of this file)
while (chunk*(i+1) <= nrow(df_tmp)) {
  c <- paste(df_tmp[((chunk*i)+1):(chunk*(i+1)), 1], collapse = " ")
  c <- gsub("[[:punct:]]", " ", c)
  words <- tolower(unlist(strsplit(c, "\\s+")))
  words <- words[!words %in% h_dictionary]
  corrected_words <- c()
  for (w in words) {
    if (is.na(match(w, sorted_words))) {
      if (is.na(match(w, corrected_lookup[,1]))) {
        rep <- mycorrect(w)
        corrected_words <- c(corrected_words, rep)
        if (!(rep == w)) {
          corrected_lookup <- rbind(corrected_lookup, cbind(w, rep))
        }
      } else {
        # reuse the previously accepted correction (column 2) for this misspelling
        corrected_words <- c(corrected_words, corrected_lookup[match(w, corrected_lookup[,1]), 2])
      }
    } else {
      corrected_words <- c(corrected_words, w)
    }
  }
  df_comments <- rbind(df_comments, paste0(corrected_words, collapse = " "))
  i <- i+1
}
df_comments <- as.data.frame(df_comments[-1,])

# writing out the list of incorrect words and their corrected forms for future reference and learning
write.table(as.data.frame(corrected_lookup),
            file = "/Users/shashank/Documents/PGDBA/ISI 1st Sem/Computing for Data Science/Project/corrected_lookup.csv",
            col.names = NA)
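
# Illustrative sketch (optional, not part of the pipeline): what a single pass of the
# spell-checker above is expected to do. mycorrect() and sorted_words come from the
# 'native modules' section at the end of this file, so the check only runs once they
# exist; the misspelt word below is a hypothetical example.
if (exists("mycorrect") && exists("sorted_words")) {
  toy_word <- "informtion"                                   # hypothetical misspelling
  print(c(original = toy_word, corrected = mycorrect(toy_word)))
}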

# parts-of-speech tagging and collecting nouns
str(df_comments)
content <- df_comments[,1]
class(content)
x <- matrix(data = "", nrow = length(content), ncol = 2)
colnames(x) <- c("Content", "Nouns")
Maxent_Sent_Token_Annotator(language = "en", probs = FALSE, model = NULL)
sent_token_annotator <- Maxent_Sent_Token_Annotator()
word_token_annotator <- Maxent_Word_Token_Annotator()
pos_tag_annotator <- Maxent_POS_Tag_Annotator()
class(content)
for (j in 1:length(content)) {
  trim <- function(a) gsub("^\\s+|\\s+$", "", a)
  s <- trim(content[j])
  if (nchar(s) == 0 || is.na(s)) {
    next
  }
  s <- as.character(s)
  s <- paste(c(s), collapse = "")
  s <- as.String(s)
  a2 <- annotate(s, list(sent_token_annotator, word_token_annotator))
  a3 <- annotate(s, pos_tag_annotator, a2)
  a3w <- subset(a3, type == "word")
  tags <- sapply(a3w$features, `[[`, "POS")
  class(a3w)
  class(a3w$features)
  a3wdf <- as.data.frame(a3w)
  a4w <- cbind(a3wdf, tags)
  i <- 1
  x[j,1] <- as.character(content[j])
  # keep only the nouns; columns 3 and 4 of a4w hold the start/end offsets of each token,
  # column 6 holds its POS tag
  for (i in 1:nrow(a4w)) {
    if (a4w[i,6] == "NN" | a4w[i,6] == "NNS" | a4w[i,6] == "NNP" | a4w[i,6] == "NNPS") {
      x[j,2] <- paste(x[j,2], s[a4w[i,3], a4w[i,4]], sep = " ")
    }
  }
}

# creating a corpus from the pre-processed data frame and further post-processing
mach_corpus <- Corpus(VectorSource(x[,2]))
myCorpus <- tm_map(mach_corpus,
                   content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte')),
                   mc.cores = 1)
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, PlainTextDocument)

# creating the document-term matrix
myDtm <- DocumentTermMatrix(myCorpus, control = list(wordLengths = c(2, Inf), weighting = weightTf))
# myDtm_smart <- DocumentTermMatrix(myCorpus, control = list(wordLengths = c(2, Inf),
#                                   weighting = function(x) weightSMART(x, spec = "Lnn")))

# tf-idf weighting in the dtm; the tf-idf weights are rescaled to positive integers
# (pseudo-counts) so that LDA, which expects count data, can be run on them
myDtm_tfidf <- DocumentTermMatrix(myCorpus, control = list(wordLengths = c(2, Inf), weighting = weightTfIdf))
myDtm$i <- myDtm_tfidf$i
myDtm$j <- myDtm_tfidf$j
myDtm$v <- as.integer(trunc((myDtm_tfidf$v)*10^(ceiling(log10(1/min((myDtm_tfidf$v)))))))
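
# Illustrative sketch (optional, not part of the pipeline): the rescaling above in
# isolation. The weights below are hypothetical tf-idf values; the smallest one is
# scaled up to at least 1 so that truncation keeps every term with a positive integer
# pseudo-count for LDA.
toy_w     <- c(0.004, 0.031, 0.120)                  # hypothetical tf-idf weights
toy_scale <- 10^(ceiling(log10(1 / min(toy_w))))     # here 10^3 = 1000
as.integer(trunc(toy_w * toy_scale))                 # pseudo-counts 4, 31 and 120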

########################################################################################
######################## lda and epistemic mapping of topics ##########################

dtm <- myDtm
rowTotals <- apply(dtm, 1, sum)
dtm2 <- dtm[rowTotals > 0, ]
dim(dtm2)

# running lda with 10 topics
dtm_LDA <- LDA(dtm2, 10)
# getting the 10 most relevant keywords for each topic, just for a quick look
get_terms(dtm_LDA, 10)

# dictionary-topic mapping based on the similarity of relevant words in the reference
# topics and the topics found by applying lda over the corpus of comments
# loading the distribution of words for each reference topic (already created)
topics_path <- "/Users/shashank/Documents/PGDBA/ISI 1st Sem/Computing for Data Science/Project/Topics/"
filenames <- list.files(topics_path, pattern = "*.csv")
f_path <- paste0(topics_path, filenames)
dic <- list()
for (i in 1:length(filenames)) {
  dic[[i]] <- as.matrix(read.csv(f_path[i], sep = ","))
  # keep only the most probable words that together cover 95% of the topic's mass
  cum_prob <- 0
  crt_len <- 0
  for (j in 1:nrow(dic[[i]])) {
    cum_prob <- cum_prob + as.numeric(dic[[i]][j,3])
    if (cum_prob >= 0.95) {
      crt_len <- j
      break
    }
  }
  if (crt_len != 0) {
    dic[[i]] <- as.matrix(read.csv(f_path[i], sep = ","))[1:crt_len, 2:3]
  } else {
    dic[[i]] <- as.matrix(read.csv(f_path[i], sep = ","))[, 2:3]
  }
  # renormalize the retained probabilities so they sum to 1
  dic[[i]][,2] <- matrix(as.numeric(dic[[i]][,2])/sum(as.numeric(dic[[i]][,2])), ncol = 1)
}

dic_len <- nrow(dic[[1]])
lda_topic_words <- list()
dtm_LDA_beta <- exp(dtm_LDA@beta)
for (i in 1:nrow(dtm_LDA@beta)) {
  tmp_beta <- sort(dtm_LDA_beta[i,], decreasing = TRUE)
  cum_prob <- 0
  crt_len <- 0
  for (j in 1:dic_len) {
    cum_prob <- cum_prob + tmp_beta[j]
    if (cum_prob > 0.95) {
      crt_len <- j
      break
    }
  }
  # build a (word, probability) table for topic i, truncated at 95% cumulative mass
  if (crt_len != 0) {
    lda_topic_words[[i]] <- cbind(get_terms(dtm_LDA, crt_len)[, i], tmp_beta[1:crt_len])
  } else {
    lda_topic_words[[i]] <- cbind(get_terms(dtm_LDA, dic_len)[, i], tmp_beta[1:dic_len])
  }
  lda_topic_words[[i]][,2] <- matrix(as.numeric(lda_topic_words[[i]][,2])/sum(as.numeric(lda_topic_words[[i]][,2])), ncol = 1)
}

# calculating the similarity of the topics found in the corpus of comments with every
# dictionary topic, based on the dot product of the probability values of the words
# shared by the two topics
smlr <- matrix(data = NA, ncol = length(dic), nrow = dtm_LDA@k)
topic_dic_index <- matrix(data = NA, ncol = 1, nrow = dtm_LDA@k)
for (i in 1:dtm_LDA@k) {
  for (j in 1:length(dic)) {
    common_lexicon <- unique(c(lda_topic_words[[i]][,1], dic[[j]][,1]))
    s <- 0
    for (w in common_lexicon) {
      if (!(is.na(match(w, lda_topic_words[[i]][,1]))) & !(is.na(match(w, dic[[j]][,1])))) {
        s <- s + as.numeric(lda_topic_words[[i]][match(w, lda_topic_words[[i]][,1]), 2]) *
                 as.numeric(dic[[j]][match(w, dic[[j]][,1]), 2])
      }
    }
    smlr[i,j] <- s
  }
  # label each lda topic with the name of its most similar dictionary topic
  topic_dic_index[i,1] <- gsub("\\s*\\.csv$", "", filenames[which.max(smlr[i,])])
}

#######################################################################################
#################### msm based markov chain modeling ##################################

# creating the batch of data required by msm for markov chain modeling
n_indv <- 1
time_factor <- 1
epoch <- length(dtm_LDA@documents)
time_frame <- as.integer(trunc(epoch*time_factor))
topic_set <- seq(1, dtm_LDA@k, 1)
df_msm <- data.frame(Ints = integer(), Ints = integer(), Ints = integer())
for (i in 1:n_indv) {
  df_time <- t(t(sort(sample(seq(1, epoch, 1), size = time_frame, replace = FALSE))))
  df_topic <- c()
  for (j in df_time) {
    t_prob <- dtm_LDA@gamma[j,]
    # df_topic <- c(df_topic, sample(topic_set, size = 1, prob = t_prob))
    df_topic <- c(df_topic, which.max(t_prob))   # assign the most probable topic to this timestamp
  }
  df_topic <- t(t(df_topic))
  df_tmp <- cbind((vector(length = time_frame) + i), df_time, df_topic)
  df_msm <- rbind(df_msm, df_tmp)
}
colnames(df_msm) <- c("id", "time", "state")

# creating the initial qmatrix and transition probability matrix as required by msm
qmatrix <- mat.or.vec(length(topic_set), length(topic_set)) + 1
init_tp <- statetable.msm(state, id, data = df_msm)
init_tp <- init_tp/rowSums(init_tp)
crudeinits.msm(state ~ time, id, data = df_msm, qmatrix = qmatrix)

# running msm to fit the DTMC with the default 'BFGS' optimizer; the control parameters
# are chosen to ensure convergence while maintaining the accuracy of the maximum
# likelihood estimates of the transition probability parameters
topic.msm <- msm(formula = state ~ time, subject = id, data = df_msm, qmatrix = qmatrix,
                 gen.inits = TRUE, obstype = 2, exacttimes = TRUE,
                 control = list(fnscale = 2500, reltol = 1e-16, ndeps = rep(1e-6, 90)))

# splitting the chain into windows of 100 time units to study the time-inhomogeneous
# nature of the markov chain under a time-series model
pmatrix_list <- list()
topic.msm_list <- list()
pointer <- 1
i <- 1
for (j in seq(100, nrow(df_msm), 100)) {
  topic.msm <- msm(formula = state ~ time, subject = id, data = df_msm[pointer:j,],
                   qmatrix = qmatrix, gen.inits = TRUE, obstype = 2, exacttimes = TRUE,
                   method = "CG")
  topic.msm_list[[i]] <- topic.msm
  pmatrix_list[[i]] <- as.matrix(pmatrix.msm(topic.msm))
  pointer <- j+1
  i <- i+1
}
ptrans_mat <- matrix(data = 0, nrow = length(pmatrix_list), ncol = length(pmatrix_list[[1]]))
for (i in (1:length(pmatrix_list))) {
  ptrans_mat[i,] <- c(pmatrix_list[[i]])   # column-major flattening of the 10 x 10 pmatrix
}

# plotting the distribution of a few transition probabilities over time
qplot(seq_len(nrow(ptrans_mat)), ptrans_mat[,1], main = "Distribution of p(1,1) over time",
      xlab = "Time Window", ylab = "p(1,1)")
qplot(seq_len(nrow(ptrans_mat)), ptrans_mat[,10], main = "Distribution of p(10,1) over time",
      xlab = "Time Window", ylab = "p(10,1)")
qplot(seq_len(nrow(ptrans_mat)), ptrans_mat[,91], main = "Distribution of p(1,10) over time",
      xlab = "Time Window", ylab = "p(1,10)")
qplot(seq_len(nrow(ptrans_mat)), ptrans_mat[,100], main = "Distribution of p(10,10) over time",
      xlab = "Time Window", ylab = "p(10,10)")
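
# Illustrative sketch (optional, not part of the pipeline): why the qplot() column
# indices above correspond to those transition probabilities. c() flattens a k x k
# matrix column-major, so entry p(i, j) of each window's pmatrix lands in column
# (j - 1) * k + i of ptrans_mat.
toy_k <- 10
toy_p <- matrix(seq_len(toy_k^2), nrow = toy_k)   # stand-in for one pmatrix.msm() result
which(c(toy_p) == toy_p[10, 1])                   # 10  -> ptrans_mat[, 10] holds p(10,1)
which(c(toy_p) == toy_p[1, 10])                   # 91  -> ptrans_mat[, 91] holds p(1,10)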

#######################################################################################
######################## visualization in igraph ######################################

test_nodes <- read.csv("G:\\PGDBA\\ISI\\Computing for Data Science\\CDS Project\\graphs\\nodes.csv", header = F, as.is = T)
test_links <- read.csv("G:\\PGDBA\\ISI\\Computing for Data Science\\CDS Project\\graphs\\links.csv", header = F, as.is = T)
transition <- read.csv("G:\\PGDBA\\ISI\\Computing for Data Science\\CDS Project\\graphs\\transition_matrix.csv", header = T, as.is = T)
transition <- transition[, 2:length(transition)]
trans_mat <- as.matrix(transition)
trans <- c()
for (i in 1:length(transition)) {
  trans <- append(trans, as.matrix(transition)[i,])
}

# fixed 2 x 2 layout for the four topic nodes
x_coor <- c(2, 2, 4, 4)
y_coor <- c(1, 4, 1, 4)
l <- as.matrix(cbind(x_coor, y_coor))

state1 <- t(as.matrix(test_nodes[,2]))   # initial topic-probability vector
step <- 10                               # no. of interpolated frames between two time points
docs <- 4                                # no. of transitions to animate
png(file = "G:\\PGDBA\\ISI\\Computing for Data Science\\CDS Project\\graphs\\example%03d.png",
    width = 1600, height = 900)
for (i in 1:docs) {
  state2 <- state1 %*% trans_mat
  for (j in 0:step) {
    # linear interpolation between the current and the next state for a smooth animation
    temp_nodes <- state1 + (state2 - state1)*j/step
    temp_nodes <- t(temp_nodes)
    temp_nodes <- as.data.frame(cbind(c("T1", "T2", "T3", "T4"), temp_nodes))
    rownames(temp_nodes) <- NULL
    temp_nodes$V2 <- as.numeric(as.character(temp_nodes$V2))
    test_net <- graph.data.frame(test_links, temp_nodes, directed = T)
    # test_net
    # V(test_net)$name
    # V(test_net)$V2
    # plot(test_net)
    # plot(test_net, edge.arrow.size=.4)
    # colrs <- c("Red", "Green", "Blue", "Yellow")
    # shade each vertex from yellow (low probability) to blue (high probability)
    color <- c(rgb((1-round(100*V(test_net)$V2[1])/100), (1-round(100*V(test_net)$V2[1])/100), (round(100*V(test_net)$V2[1])/100)),
               rgb((1-round(100*V(test_net)$V2[2])/100), (1-round(100*V(test_net)$V2[2])/100), (round(100*V(test_net)$V2[2])/100)),
               rgb((1-round(100*V(test_net)$V2[3])/100), (1-round(100*V(test_net)$V2[3])/100), (round(100*V(test_net)$V2[3])/100)),
               rgb((1-round(100*V(test_net)$V2[4])/100), (1-round(100*V(test_net)$V2[4])/100), (round(100*V(test_net)$V2[4])/100)))
    # plot(test_net, vertex.size=200*V(test_net)$V2, edge.arrow.size=.6, layout=l, vertex.color=color,
    #      vertex.label=as.character(V(test_net)$name), main=paste("time=", as.character(i+j/step)))
    # remove links to self before plotting
    test_net <- simplify(test_net, remove.multiple = F, remove.loops = T)
    plot(test_net, vertex.size = 200*V(test_net)$V2, vertex.label = as.character(V(test_net)$name),
         edge.arrow.size = .6, layout = l, vertex.color = color,
         main = paste("time=", as.character(i+j/step)))
  }
  state1 <- state2
  rm(temp_nodes)
}
dev.off()   # close the png device so the animation frames are written to disk
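
# Illustrative sketch (optional, not part of the pipeline): the vertex-coloring rule
# used above, in isolation. A topic probability p is rounded to two decimals and mapped
# to rgb(1 - p, 1 - p, p), so vertices fade from yellow (p near 0) through grey to blue
# (p near 1). The probabilities below are hypothetical.
toy_p <- c(0.05, 0.50, 0.95)
toy_q <- round(100*toy_p)/100
rgb(1 - toy_q, 1 - toy_q, toy_q)   # yellow-ish, grey and blue-ish hex codes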
"G:/EclipseWorkspace/scrapping/2_Bhopal_output.csv")); int v = 4383; int count = 0; BufferedWriter bw = new BufferedWriter(new FileWriter( "G:/EclipseWorkspace/scrapping/Bhopal_output_" + file_count + ".csv")); do { // define baseurl here String govbase = "https://mygov.in/group-issue/smart-city-bhopal/?field_hashtags_tid=&sort_by=created&sort_order=DESC&page=0%2C"; String govlink = govbase + v; System.out.println(v); // / the below code will take all links from discussions page String govres = getUrlSource(govlink); // System.out.println(govres); org.jsoup.nodes.Document doc = Jsoup.parse(govres, "UTF-8"); Elements discussionlink = doc.select("div.comment_body"); // String f1 = discussionlink.toString(); // System.out.println(f1); Elements c1 = discussionlink.select("p"); String s1 = c1.toString(); String comments[] = s1.split("

\n

"); for (int i = 0; i < comments.length; i++) { comments[i] = comments[i].replaceAll("\\<[^>]*>", ""); comments[i] = comments[i].replace(",", ""); bw.write(comments[i] + " "); bw1.write(comments[i]); bw1.newLine(); count++; if (count == 500) { Elements time = doc.select("span.date_time"); String f2 = time.toString(); String s2 = f2.replace("", "") .replace("", ""); String times[] = s2.split("\n"); bw.write("," + times[0]); bw.close(); file_count++; count = 0; bw = new BufferedWriter(new FileWriter( "G:/EclipseWorkspace/scrapping/Bhopal_output_" + file_count + ".csv")); } } v++; } while (v <= 5000); bw1.close(); } private static String getUrlSource(String govlink) throws IOException { // System.out.println("inside geturlsource"); URL userurl = new URL(govlink); HttpsURLConnection input= (HttpsURLConnection) userurl .openConnection(); BufferedReader in = new BufferedReader(new InputStreamReader( input.getInputStream(), "UTF-8")); String inputLine; StringBuilder a = new StringBuilder(); while ((inputLine = in.readLine()) != null) a.append(inputLine); in.close(); return a.toString(); } } # aspell based spell_checker raw_data=read.csv("G:\\EclipseWorkspace\\scrapping\\comments.csv",header=FALSE) nrow(raw_data) doc=500 comment=65 data=data.frame(Comment=NA) for(i in 1:doc){ print(i) raw_data2=paste(raw_data[(comment*(i-1)+1):(comment*i),1],collapse=" ") words <- unlist(strsplit(raw_data2, "\\s+")) words <- words[!words %in% dictionary] for(c in 1:length(words)){ asp=aspell(as.factor(words[c])) if(length(asp$Suggestion)==1){ if(class(unlist(asp$Suggestion))!="NULL"){ words[c]=unlist(asp$Suggestion)[1] } } } raw_data3=paste(words,collapse=" ") data=rbind(data,raw_data3) } # spell_checker based on soundex encoding and full Damerau-Levenshtein distance # read in big.txt, a 6.5 mb collection of different english texts. raw_text <- paste(readLines("/Users/shashank/Documents/PGDBA/ISI\ 1st\ Sem/Computing\ for\ Data\ Science/Project/big.txt"), collapse = " ") # make the text lowercase and split it up creating a huge vector of word tokens. split_text <- strsplit(tolower(raw_text), "[^a-z]+") # count the number of different type of words. word_count <- table(split_text) # sort the words and create an ordered vector with the most common type of words first. sorted_words <- names(sort(word_count, decreasing = TRUE)) mycorrect <- function(word,dist=4) { # Calculate the edit distance between the word and all other words in sorted_words. soundex_dist <- stringdist(word, sorted_words,method = "soundex") soundex_filtered_words =sorted_words[which(soundex_dist==0)] dl_dist <- stringdist(word, soundex_filtered_words,method = "dl") # Calculate the minimum edit distance to find a word that exists in big.txt # with a limit of two edits. min_edit_dist <- min(dl_dist, dist) # Generate a vector with all words with this minimum edit distance. # Since sorted_words is ordered from most common to least common, the resulting # vector will have the most common / probable match first. proposals_by_prob <- c(soundex_filtered_words[ dl_dist <= min_edit_dist]) # In case proposals_by_prob would be empty we append the word to be corrected... proposals_by_prob <- c(proposals_by_prob, word) # ... and return the first / most probable word in the vector. proposals_by_prob[1] } ################################## End ################################################