443-970-2353
[email protected]
CV Resume
This lab covers text analytics in R using classification trees (CART), random forests, and logistic regression.
The data for the first part of the analysis comes from the 2010 TREC Legal Track.
# Load the e-mail data set; text columns stay character vectors, not factors.
emails <- read.csv(file = "energy_bids.csv", stringsAsFactors = FALSE)

# Print a compact summary of the columns and their types.
str(emails)
Look at emails
# Print the first two messages, each followed by its `responsive` label.
for (i in seq_len(2)) {
  print(emails$email[i])
  print(emails$responsive[i])
}
Responsive emails
# Class balance: number of e-mails for each value of `responsive`.
table(emails$responsive)

library(tm)

# Build a corpus with one document per e-mail.
corpus <- Corpus(VectorSource(emails$email))
# as.character() shows the document text; printing corpus[[1]] directly only
# shows a metadata summary on tm >= 0.6.
as.character(corpus[[1]])

# Lower-case through content_transformer() so every element stays a tm text
# document. Passing bare `tolower` to tm_map() and then patching the result
# with PlainTextDocument corrupts the corpus on tm >= 0.6, which makes the
# document-term matrix disagree with the rows of `emails`.
corpus <- tm_map(corpus, content_transformer(tolower))

# Size of the English stop-word list that is removed below.
length(stopwords("english"))

corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)

# Document-term matrix: one row per e-mail, one column per stemmed term.
dtm <- DocumentTermMatrix(corpus)
dtm

# Drop the sparsest terms (sparsity threshold 0.97) to keep the model small.
dtm <- removeSparseTerms(dtm, 0.97)
dtm

labeledTerms <- as.data.frame(as.matrix(dtm))

# Terms can be syntactically invalid R names (e.g. start with a digit), which
# breaks formula-based models such as `responsive ~ .`; sanitise them first.
colnames(labeledTerms) <- make.names(colnames(labeledTerms))

# The labels are attached by position, so the row counts must agree.
stopifnot(nrow(labeledTerms) == nrow(emails))
labeledTerms$responsive <- emails$responsive
str(labeledTerms)
library(caTools)

# Fixed seed so the train/test partition is reproducible.
set.seed(144)

# Logical mask: TRUE marks rows assigned to the training set (ratio 0.7).
spl <- sample.split(labeledTerms$responsive, SplitRatio = 0.7)

# Index with `[` rather than subset(): subset() evaluates its condition inside
# the data frame first, so a term column that happened to be called `spl`
# would silently replace the mask. drop = FALSE keeps the result a data frame.
train <- labeledTerms[spl, , drop = FALSE]
test <- labeledTerms[!spl, , drop = FALSE]
library(rpart)
library(rpart.plot)

# Classification tree predicting `responsive` from all term columns.
emailCART <- rpart(responsive ~ ., data = train, method = "class")
prp(emailCART)

# Without `type`, predict() on a class tree returns one probability column
# per class.
pred <- predict(emailCART, newdata = test)
# head() also works when the test set has fewer than 10 rows.
head(pred, 10)

# Second column is taken as the probability of the second class level.
pred.prob <- pred[, 2]

# Pin both levels so the confusion matrix is always 2x2. With a bare logical,
# a model that predicts only one class yields a 2x1 table and diag() then
# returns a meaningless "accuracy".
predicted <- factor(pred.prob >= 0.5, levels = c(FALSE, TRUE))
confusion <- table(test$responsive, predicted, dnn = c("actual", "predicted"))
confusion

# Accuracy of the tree at the 0.5 threshold.
accuracy <- sum(diag(confusion)) / sum(confusion)
accuracy

# Baseline: accuracy of always predicting the most frequent class.
class_counts <- table(test$responsive)
class_counts
accuracy <- max(class_counts) / sum(class_counts)
accuracy
library(ROCR)

# Pair the predicted probabilities with the true test labels.
predROCR <- prediction(predictions = pred.prob, labels = test$responsive)

# ROC curve: true positive rate plotted against false positive rate.
perfROCR <- performance(predROCR, measure = "tpr", x.measure = "fpr")
plot(perfROCR, colorize = TRUE)
From the ROC curve, we see that a threshold of about 0.15 gives higher sensitivity (true positive rate) while keeping the false positive rate at about 0.2.
# Area under the ROC curve for the e-mail tree; the value is in the y.values slot.
aucROCR <- performance(predROCR, measure = "auc")
aucROCR@y.values
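The AUC is about 0.8: given one randomly chosen responsive email and one randomly chosen non-responsive email, the model assigns the higher predicted probability to the responsive one about 80% of the time. (This is not the same as classifying 80% of emails correctly.)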
# Load the tweets; text columns stay character vectors, not factors.
tweets <- read.csv(file = "tweets.csv", stringsAsFactors = FALSE)
str(tweets)

# Outcome variable: a tweet counts as negative when its `Avg` score is <= -1.
is_negative <- tweets$Avg <= -1
tweets$Negative <- as.factor(is_negative)

# Class balance of the new outcome.
table(tweets$Negative)
library(tm)
library(SnowballC)

# Build a corpus with one document per tweet.
corpus <- Corpus(VectorSource(tweets$Tweet))
corpus
# as.character() shows the document text; printing corpus[[1]] directly only
# shows a metadata summary on tm >= 0.6. The first tweet is re-printed after
# each step to show the effect of that transformation.
as.character(corpus[[1]])

# Lower-case through content_transformer() so every element stays a tm text
# document. Passing bare `tolower` to tm_map() and then patching the result
# with PlainTextDocument corrupts the corpus on tm >= 0.6.
corpus <- tm_map(corpus, content_transformer(tolower))
as.character(corpus[[1]])

corpus <- tm_map(corpus, removePunctuation)
as.character(corpus[[1]])

# Peek at the English stop-word list and its size.
stopwords("english")[1:10]
length(stopwords("english"))

# Remove stop words plus "apple".
corpus <- tm_map(corpus, removeWords, c("apple", stopwords("english")))
as.character(corpus[[1]])

corpus <- tm_map(corpus, stemDocument)
as.character(corpus[[1]])

# Document-term matrix: one row per tweet, one column per stemmed term.
frequencies <- DocumentTermMatrix(corpus)
frequencies
inspect(frequencies[1000:1005, 505:515])

# Terms that occur at least 20 times overall.
findFreqTerms(frequencies, lowfreq = 20)

# Drop the sparsest terms (sparsity threshold 0.995).
sparse <- removeSparseTerms(frequencies, 0.995)
sparse

tweetsSparse <- as.data.frame(as.matrix(sparse))
# Do this any time a data frame is built from text: terms may not be valid
# R names, which breaks formula-based models.
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse))

# The labels are attached by position, so the row counts must agree.
stopifnot(nrow(tweetsSparse) == nrow(tweets))
tweetsSparse$Negative <- tweets$Negative
library(caTools)

# Fixed seed so the train/test partition is reproducible.
set.seed(123)

# Logical mask: TRUE marks rows assigned to the training set (ratio 0.7).
# The name `split` is kept for compatibility with any code that reads it.
split <- sample.split(tweetsSparse$Negative, SplitRatio = 0.7)

# Index with `[` rather than subset(): subset() evaluates its condition inside
# the data frame first, so a term column named "split" (a plausible stemmed
# word) would silently replace the mask. drop = FALSE keeps a data frame.
trainSparse <- tweetsSparse[split, , drop = FALSE]
testSparse <- tweetsSparse[!split, , drop = FALSE]
library(rpart)
library(rpart.plot)

# Classification tree predicting `Negative` from all term columns.
tweetCART <- rpart(Negative ~ ., data = trainSparse, method = "class")
prp(tweetCART)

# Hard class predictions on the held-out tweets.
predictCART <- predict(tweetCART, newdata = testSparse, type = "class")

# Confusion matrix: rows are the true labels, columns the predictions.
cartConfusion <- table(testSparse$Negative, predictCART)
cartConfusion
accuracy <- sum(diag(cartConfusion)) / sum(cartConfusion)
accuracy

# Baseline: accuracy of always predicting the most frequent class.
negativeCounts <- table(testSparse$Negative)
negativeCounts
accuracy <- max(negativeCounts) / sum(negativeCounts)
accuracy
library(randomForest)

# Fixed seed so the forest is reproducible.
set.seed(123)
tweetRF <- randomForest(Negative ~ ., data = trainSparse)

# Class predictions on the held-out tweets.
predictRF <- predict(tweetRF, newdata = testSparse)

# Confusion matrix: rows are the true labels, columns the predictions.
rfConfusion <- table(testSparse$Negative, predictRF)
rfConfusion
accuracy <- sum(diag(rfConfusion)) / sum(rfConfusion)
accuracy
# Logistic regression of `Negative` on all term columns.
tweetLR <- glm(Negative ~ ., data = trainSparse, family = "binomial")

# Predicted probabilities for the held-out tweets.
predictions <- predict(tweetLR, newdata = testSparse, type = "response")

# Pin both levels so the confusion matrix is always 2x2. With a bare logical,
# predictions that all fall on one side of 0.5 yield a 2x1 table and diag()
# then returns a meaningless "accuracy".
predictedNegative <- factor(predictions > 0.5, levels = c(FALSE, TRUE))
lrConfusion <- table(testSparse$Negative, predictedNegative,
                     dnn = c("actual", "predicted"))
lrConfusion

# Accuracy of the logistic model at the 0.5 threshold.
accuracy <- sum(diag(lrConfusion)) / sum(lrConfusion)
accuracy