TEXT (.txt)
This working script was tested on R 3.2.5. The code is adapted from https://github.com/gimoya/theBioBucket-Archives/blob/master/R/txtmining_pdf.R. It reads a text file, removes stop words, merges words that share a stem, and plots the result as a word cloud.
Code:
# Build a word cloud from a plain-text file: remove stop words, count terms,
# merge terms that share a stem, and plot the result.
library(tm)
library(wordcloud)
library(Rstem)

filetxt <- "C:\\Users\\310211146\\Documents\\Other\\May_Report.txt"

# Read the file line by line and lower-case it so that stop-word matching
# and term counting are case-insensitive.
txt <- readLines(filetxt)
txt <- tolower(txt)
# Drop tm's default stop-word list; the "\\f" entry is a regex for the
# form-feed character.
txt <- removeWords(txt, c("\\f", stopwords()))

# Every element of txt (one input line) becomes one document in the corpus.
corpus <- Corpus(VectorSource(txt))
corpus <- tm_map(corpus, removePunctuation)

# Sum each term's count over all documents, most frequent first.
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
d <- data.frame(freq = sort(rowSums(m), decreasing = TRUE))
d$stem <- wordStem(row.names(d), language = "english")
d$word <- row.names(d)
# Keep only terms shorter than 20 characters.
d <- d[nchar(row.names(d)) < 20, ]

# Collapse terms that share a stem: add up their counts and label the group
# with the first word listed for that stem (d is sorted by descending
# frequency at this point).
agg_freq <- aggregate(freq ~ stem, data = d, sum)
agg_word <- aggregate(word ~ stem, data = d, function(x) {
  x[1]
})
# Both aggregates group the same data by the same key, so their rows line up.
d <- cbind(freq = agg_freq$freq, agg_word)
d <- d[order(d$freq, decreasing = TRUE), ]

wordcloud(d$word, d$freq)
# Build a word cloud from a plain-text file: remove stop words, count terms,
# merge terms that share a stem, and plot the result.
# tm supplies Corpus, tm_map, removeWords and stopwords used below; it is
# loaded here so this listing can be run without any preceding code.
library(tm)
library(wordcloud)
library(Rstem)

filetxt <- "C:\\Users\\310211146\\Documents\\Other\\May_Report.txt"

# Read the file line by line and lower-case it so that stop-word matching
# and term counting are case-insensitive.
txt <- readLines(filetxt)
txt <- tolower(txt)
# Drop tm's default stop-word list; the "\\f" entry is a regex for the
# form-feed character.
txt <- removeWords(txt, c("\\f", stopwords()))

# Every element of txt (one input line) becomes one document in the corpus.
corpus <- Corpus(VectorSource(txt))
corpus <- tm_map(corpus, removePunctuation)

# Sum each term's count over all documents, most frequent first.
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
d <- data.frame(freq = sort(rowSums(m), decreasing = TRUE))
d$stem <- wordStem(row.names(d), language = "english")
d$word <- row.names(d)
# Keep only terms shorter than 20 characters.
d <- d[nchar(row.names(d)) < 20, ]

# Collapse terms that share a stem: add up their counts and label the group
# with the first word listed for that stem (d is sorted by descending
# frequency at this point).
agg_freq <- aggregate(freq ~ stem, data = d, sum)
agg_word <- aggregate(word ~ stem, data = d, function(x) {
  x[1]
})
# Both aggregates group the same data by the same key, so their rows line up.
d <- cbind(freq = agg_freq$freq, agg_word)
d <- d[order(d$freq, decreasing = TRUE), ]

wordcloud(d$word, d$freq)
Output:
PDF (.pdf)
I wrote the following script based on the reference mentioned above and three other references.
Code:
# Build a word cloud from a PDF: extract the text, clean and stem it, count
# the resulting terms, and plot them.
library(tm)
library(wordcloud)
# NOTE(review): nothing below calls an Rstem function directly — confirm
# whether this package is still needed here.
library(Rstem)
library(SnowballC)

# PDF reader for the corpus; "-layout" is handed to the text-extraction
# step (presumably pdftotext's layout-preserving option — confirm against
# the installed tm version).
Rpdf <- readPDF(control = list(text = "-layout"))
corpus <- Corpus(
  URISource("C:\\Users\\310211146\\Documents\\PDF\\May_Report.pdf"),
  readerControl = list(reader = Rpdf)
)

# Cleaning order matters:
#   1. lower-case, so stop-word matching is case-insensitive;
#   2. remove stop words while apostrophes are still present, since entries
#      such as "don't" would no longer match once punctuation is stripped;
#   3. strip punctuation and digits, so the stemmer sees bare words rather
#      than tokens with punctuation attached;
#   4. stem;
#   5. collapse whitespace last, to tidy the gaps left by the removals.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)

# Sum each term's count over the corpus, most frequent first.
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
d <- data.frame(freq = sort(rowSums(m), decreasing = TRUE))

# Terms are the row names of d; colour the cloud with a 7-colour "Dark2"
# palette.
wordcloud(row.names(d), d$freq, colors = brewer.pal(7, "Dark2"))
# Build a word cloud from a PDF: extract the text, clean and stem it, count
# the resulting terms, and plot them.
# tm supplies readPDF, Corpus, URISource, tm_map and the transformations
# used below; it is loaded here so this listing can be run without any
# preceding code.
library(tm)
library(wordcloud)
# NOTE(review): nothing below calls an Rstem function directly — confirm
# whether this package is still needed here.
library(Rstem)
library(SnowballC)

# PDF reader for the corpus; "-layout" is handed to the text-extraction
# step (presumably pdftotext's layout-preserving option — confirm against
# the installed tm version).
Rpdf <- readPDF(control = list(text = "-layout"))
corpus <- Corpus(
  URISource("C:\\Users\\310211146\\Documents\\PDF\\May_Report.pdf"),
  readerControl = list(reader = Rpdf)
)

# Cleaning order matters:
#   1. lower-case, so stop-word matching is case-insensitive;
#   2. remove stop words while apostrophes are still present, since entries
#      such as "don't" would no longer match once punctuation is stripped;
#   3. strip punctuation and digits, so the stemmer sees bare words rather
#      than tokens with punctuation attached;
#   4. stem;
#   5. collapse whitespace last, to tidy the gaps left by the removals.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)

# Sum each term's count over the corpus, most frequent first.
tdm <- TermDocumentMatrix(corpus)
m <- as.matrix(tdm)
d <- data.frame(freq = sort(rowSums(m), decreasing = TRUE))

# Terms are the row names of d; colour the cloud with a 7-colour "Dark2"
# palette.
wordcloud(row.names(d), d$freq, colors = brewer.pal(7, "Dark2"))
Output:
No comments:
Post a Comment
Note: Only a member of this blog may post a comment.