Fisseha Berhane, PhD

Data Scientist

443-970-2353 fisseha@jhu.edu CV Resume Linkedin GitHub twitter twitter

PDF Mining with R using Shiny





A PDF document is not so great in terms of searching and indexing and it becomes an overwhelming task to search through many documents individually or compare two or more documents manually.

This application helps to get useful insights from PDF documents by creating visualizations and summarizations. It also enables searching, sorting and filtering. We can browse through lots of documents in a single click and get a summary and comparison of the documents instantly.

We can upload books, journals, surveys, ...

You can try the app by clicking here.


Shiny apps have at least two parts. A server.R, which is R script, and ui.R, which controls the look and feel of the user interface.

My server.R and ui.R codes are given below. They are also on GitHub.

ui.R

In [ ]:
library(shiny)
library(shinydashboard)
library(shinythemes)
library(plotly)
dashboardPage(skin="blue",
              dashboardHeader(title="PDF Extractor",titleWidth=300),
              
              dashboardSidebar(width=250,
                               sidebarMenu(
                                 br(),
                                 menuItem(tags$em("Provide PDFs",style="font-size:150%"),icon=icon("upload"),tabName="data"),
                                 menuItem(tags$em("Summaries",style="font-size:150%"),icon=icon("bar-chart-o"),tabName="summary"),
                                 menuItem(tags$em("Search and Filter",style="font-size:150%"),icon=icon("search"),tabName="search")
                                 
                                 
                                 
                                 )
                                 ),
              
              dashboardBody(
                tabItems(
                  tabItem(tabName="data",
                           
                          
                          br(),
                          br(),
                          tags$h4("A PDF document is not so great in terms of searching and indexing
                                  and it becomes an overwhelming task to search through many documents 
                                  individually or compare two or more documents manually."),
                        


                      tags$h4("This application helps to get useful insights from PDF documents 
                           by creating visualizations and summarizations. It also enables searching, sorting and filtering. 
                           We can browse through lots of documents in a single click and
                           get a summary and comparison of the documents in minutes.",style="color:#009900"),

                      tags$h4("Upload PDF documents (books, journals, surveys,etc.) from disk and/or provide links of PDF documents. 
                              Then go to the", tags$span("Summaries",style="color:red"), tags$span("section in the sidebar to get summaries of the uploaded documents.
                              We can search one or more terms and see their distribution across the uploaded documents in 
                              the" , tags$span("Search and Filter",style="color:red"), tags$span("menu item. We can also filter to display words with certain frequency range only."))),
                      
                      
                          br(),
                          br(),
                          br(),
                          
                          column(width = 5,
                          textInput("link", label = p("Provide link(s) to your PDF document(s) separated by comma",style="text-align:center;color:#990099;font-size:110%"),
                                    value = ""),
                          
                                         br()
                          ),
                          
                          column(width = 3,
                                         
                                    tags$h3("And/Or", style="text-align:center;color:blue;font-size:120%"),
                          br()
                         
                          ),
                          column(width = 4,
                          fileInput('file1', em('Choose PDF File',style="text-align:center;color:red;font-size:120%"),multiple = TRUE,
                                    accept=c('.pdf')),
                          
                          br()
                          ),
                          br()
                          
                        ),
                  
                  
                  tabItem(tabName="summary",
                          fluidRow(
                            tabBox(width=12,
                                   tabPanel(tags$em("Word Cloud",style="font-size:150%"),
        
                                            column(width = 8,
                                          plotOutput("wordcloud")),
                                            
                                          column(width = 4,
                                                 br(),
                                          uiOutput("minfreq"),
                                              br(),
                                            uiOutput("maxwords"),
                                              br(),
                                          uiOutput("forEach"))
                                            
                                            ),
                                   
                                   
                                  tabPanel(tags$em("Plotly Bar graph",style="font-size:150%"),
                                            
                                            plotlyOutput("myplot",height = "700px"),
                                           br(),
                                   
                                              uiOutput("numwords")
                                           
                                           )
                                   
                                   
                                            
                                            
                                            ))),
             
                  
                  
                  
                  
                  tabItem(tabName="search",
                        
                            
                            DT::dataTableOutput("DataTable"),
                             br(),
                             uiOutput("text"),
                            uiOutput('forsearch'),
                            uiOutput('searchbutton'),
                            plotlyOutput("searched",height = '600px')
                          
                            
                            
                                       
                                       )
                          )))

server.R

In [ ]:
library(shiny)
library(pdftools)
library(stringr)
library(stringi)
library(tm)
library(ggplot2)
library(dplyr)
library(wordcloud)
library(plotly)
library(DT)



shinyServer(function(input, output) {

  options(shiny.maxRequestSize=800*1024^2) 
  

  mypdf1_list <- reactive({
    
   
        withProgress({
         
          if(nchar(input$link)>0){
            setProgress(message = "Downloading Document...")
            

                
                address=unlist(strsplit(input$link,","))
              
          
            pdfs=list()
            
            for(i in 1:length(address)){
              pdfs[[i]]= pdf_text(str_trim(address[i]))
            }
            
            pdfs
            
               }else(return(NULL))
          
        })
    
    })


  
  

mypdf2_list<-reactive({
    
    inFile <- input$file1
    
    if (is.null(inFile)){
      return(NULL)
    }else{
      
      withProgress({
        setProgress(message = "Extracting Text...")
        
      lst=list()
      for(i in 1:length(inFile[,1])){
        lst[[i]] <- pdf_text(inFile[[i, 'datapath']])
      }
      
      lst
      
      })
    }
    
  })
  




documents<-reactive({
  
  
  inFile <- input$file1
  
  c(unlist(strsplit(input$link,",")),inFile$name)
  
})






mymatrix<-reactive({


  withProgress({
    setProgress(message = "Processing corpus...")
  
  txt=c(unlist(mypdf1_list()), unlist(mypdf2_list()))

  
   if(is.null(txt))
     
     return(NULL)
    
    # Create corpus
    corpus=Corpus(VectorSource(txt))
    # Convert to lower-case
    corpus=tm_map(corpus,tolower)
    corpus = tm_map(corpus, PlainTextDocument)
    corpus = tm_map(corpus, removePunctuation)
    
    # Remove stopwords
    corpus=tm_map(corpus,function(x) removeWords(x,stopwords("english")))
    
    frequencies = DocumentTermMatrix(corpus)
   # sparse = removeSparseTerms(frequencies,0.9)
    #sparse =as.matrix(sparse)
    
    sparse =as.matrix(frequencies)
    sparse=apply(sparse,2,sum)
    sparse=sparse[order(sparse,decreasing = T)]
    
    Term=names(sparse)
    Frequency=as.vector(sparse)
    
    sparse=as.data.frame(list(Term=Term,Frequency=Frequency))
    sparse$Term = stri_trans_totitle(sparse$Term) 
    sparse
   
  })
  })




output$DataTable <-DT::renderDataTable(
  
  withProgress({
    setProgress(message = "Preparing Table...")
    datatable(
      mydataTable(), filter = 'top',  
      options = list(pageLength = 5, autoWidth = TRUE),
      rownames= FALSE
      
    )
  })
)


mydataTable<-reactive({
  
  withProgress({
    setProgress(message = "Computing...")
    if(is.null(mymatrix()))
      return(NULL)
    
    pdfs=c(mypdf1_list(), mypdf2_list())
    
    
    
    if(length(pdfs)>0){
      
      documents=documents()
      
      mydata=data.frame()
      
      for(i in 1:length(pdfs)){
        
        txt = pdfs[[i]]
        
        # Create corpus
        corpus=Corpus(VectorSource(txt))
        # Convert to lower-case
        corpus=tm_map(corpus,tolower)
        corpus = tm_map(corpus, PlainTextDocument)
        corpus = tm_map(corpus, removePunctuation)
        
        # Remove stopwords
        corpus=tm_map(corpus,function(x) removeWords(x,stopwords("english")))
        
        frequencies = DocumentTermMatrix(corpus)
        #sparse = removeSparseTerms(frequencies,1)
        sparse =as.matrix(frequencies)
        sparse=apply(sparse,2,sum)
        sparse=sparse[order(sparse,decreasing = T)]
        
        Term=names(sparse)
        Frequency=as.vector(sparse)
        
        sparse=as.data.frame(list(Term=Term,Frequency=Frequency))
        sparse$Term = stri_trans_totitle(sparse$Term) 
        sparse$Document=documents[i]
        mydata=rbind(mydata,sparse)
        
      }
      
      mydata = mydata%>%mutate(Term = factor(Term,levels = Term[order(Frequency,decreasing =T)]))
      
      if(nrow(mydata)>7000){
        head(mydata,7000)
      }else{mydata}
      
    }
  })
})


  
output$wordcloud <- renderPlot({
  
    if(is.null(mymatrix()))
      return(NULL)
    
      sparse=mymatrix()
      
    pal2 <- brewer.pal(8,"Dark2")
    
    wordcloud(sparse$Term,sparse$Frequency, min.freq=input$freq, max.words=input$max,
              random.order=FALSE,scale=c(4,0.5),
              rot.per=0.35, use.r.layout=FALSE, colors=pal2)
    
    
    
  
  
   if(input$for_each==TRUE){
      
      
      if(is.null(mymatrix()))
        return(NULL)
      
     pdfs=c(mypdf1_list(), mypdf2_list())
      
      
      
      if(length(pdfs)>0){
        
        

        
        for(i in 1:length(pdfs)){
          
          txt = pdfs[[i]]
          
          # Create corpus
          corpus=Corpus(VectorSource(txt))
          # Convert to lower-case
          corpus=tm_map(corpus,tolower)
          corpus = tm_map(corpus, PlainTextDocument)
          corpus = tm_map(corpus, removePunctuation)
          
          # Remove stopwords
          corpus=tm_map(corpus,function(x) removeWords(x,stopwords("english")))
          
          frequencies = DocumentTermMatrix(corpus)
          #sparse = removeSparseTerms(frequencies,1)
          sparse =as.matrix(frequencies)
          sparse=apply(sparse,2,sum)
          sparse=sparse[order(sparse,decreasing = T)]
          
          Term=names(sparse)
          Frequency=as.vector(sparse)
          
          sparse=as.data.frame(list(Term=Term,Frequency=Frequency))
          sparse$Term = stri_trans_totitle(sparse$Term) 
          
          
          pal2 <- brewer.pal(8,"Dark2")
          
          documents=documents()
          
          x11(title = documents[i])
          
        
          wordcloud(sparse$Term,sparse$Frequency, min.freq=input$freq, max.words=input$max,
                    random.order=FALSE,scale=c(4,0.5),
                    rot.per=0.35, use.r.layout=FALSE, colors=pal2)
      
        }
        }
    }
    
    
  })
  

toshow<-reactive({
  input$show
})


output$myplot <- renderPlotly({
  
  if(is.null(mymatrix()))
    return(NULL)
  
  maximum =toshow()
  
  sparse=mymatrix()
  sparse=sparse[1:maximum,]
  
  q=sparse%>%mutate(Term = factor(Term,levels = Term[order(Frequency,decreasing =F)]))%>%
    ggplot(aes(x=Term,y=Frequency))+geom_bar(stat='identity',color='#c2c2a3',fill='#b35900')+
    xlab("")+ggtitle('Most frequent words')+theme(plot.title = element_text(size = 16,colour="blue"))+
    coord_flip()# +theme(axis.text.x = element_text(angle=-90))
  
  
  
  p <- ggplotly(q + ylab(" ") + xlab(" "))
  x <- list(
    title = "Frequency"
  )
  y <- list(
    title = ""
  )
  p %>% layout(xaxis = x, yaxis = y)
  

  
  
})
  
  
  
mysearch<-reactive({
  
  input$search
  isolate(
    stri_trans_totitle(input$searchText)
  )
  
})



SearchMatrix<-reactive({
  
  if(is.null(mysearch()))
      return(NULL)
  
  if(is.null(mymatrix()))
    return(NULL)
  
  pdfs=c(mypdf1_list(), mypdf2_list())
  
  
  if(length(pdfs)>0){
    
  
    
  mm=data.frame()
  
  for(i in 1:length(pdfs)){
    
    txt = pdfs[[i]]
    
    # Create corpus
    corpus=Corpus(VectorSource(txt))
    # Convert to lower-case
    corpus=tm_map(corpus,tolower)
    corpus = tm_map(corpus, PlainTextDocument)
    corpus = tm_map(corpus, removePunctuation)
    
    # Remove stopwords
    corpus=tm_map(corpus,function(x) removeWords(x,stopwords("english")))
    
    frequencies = DocumentTermMatrix(corpus)
    #sparse = removeSparseTerms(frequencies,1)
    sparse =as.matrix(frequencies)
    sparse=apply(sparse,2,sum)
    sparse=sparse[order(sparse,decreasing = T)]
    
    Term=names(sparse)
    Frequency=as.vector(sparse)
    
    sparse=as.data.frame(list(Term=Term,Frequency=Frequency))
    sparse$Term = stri_trans_totitle(sparse$Term) 
    
    if(stri_trans_totitle(mysearch())%in%sparse$Term==TRUE){
        total= filter(sparse,Term==stri_trans_totitle(mysearch()))$Frequency
        }else{total=0}
    
    mm=rbind(mm,total)
    
  }
  
  Documents=documents()
  names(mm)=c('Frequency')
  
  mm=cbind(Documents,mm)
  mm
  }
  
})



output$searched<-renderPlotly({
  
  if(is.null(SearchMatrix()))
    return(NULL)

  
  if(is.na(SearchMatrix()))
    return(NULL)
  
  if(nchar(SearchMatrix())==0)
    return(NULL)
  
  mm=SearchMatrix()
  
  q=mm%>%mutate(Documents = factor(Documents,levels = Documents[order(Frequency,decreasing =T)]))%>%
    ggplot(aes(x=Documents,y=Frequency))+geom_bar(stat='identity',color='#c2c2a3',fill='#999966',width=.5)+
    xlab("")+ggtitle('Search Term Frequency by Document')+ylab('')+
    theme(axis.text.x = element_text(angle=-20))+
    theme(plot.title = element_text(size = 16,colour="blue"))+
    theme(axis.text.x = element_text(colour="#4d0000",size=12))
  
  
  p = ggplotly(q)
  
  
  p
  
  
})


output$minfreq = renderUI({
  
  if(is.null(mymatrix()))
    return(NULL)
  sliderInput("freq",
              em("Minimum Frequency:",style="color:black;font-size:100%"),
              min = 1,  max = 50, value = 15)
})


output$numwords = renderUI({
  
  if(is.null(mymatrix()))
    return(NULL)
  
  sliderInput("show",
              em("Number of Words to Display:",style="color:Blue;font-size:100%"),
              min = 5,  max = 50, value = 25)
})


output$maxwords = renderUI({
  
  if(is.null(mymatrix()))
    return(NULL)
  sliderInput("max",
              em("Maximum Number of Words:",style="color:black;font-size:100%"),
              min = 1,  max = 300,  value = 200)
})


output$forEach = renderUI({
  
  if(is.null(mymatrix()))
    return(NULL)
  checkboxInput("for_each", label = p("Create Word Cloud for Each Document",style="color:#ff0000;font-size:120%" ))
})




output$forsearch = renderUI({
  
  if(is.null(mymatrix()))
    return(NULL)
  sidebarSearchForm(label = "Search...", textId="searchText", "searchButton")
})


output$text= renderUI({
  
  if(!is.null(mymatrix())){
    
  
  p("Search a word and you will see a bar graph of its frequency across all documents",
    style="text-align:center;color:#990099;font-size:110%")
  }
})



output$searchbutton = renderUI({
  
  if(is.null(mymatrix()))
    return(NULL)
  actionButton("search", "Search")
})





})
comments powered by Disqus