Also, you don’t have to restart the app to check another document; just upload a new file and it starts over with each one.
#| standalone: true
#| viewerHeight: 800
library(shiny)
library(readtext)
library(stringr)
library(DT)
library(bslib)
library(shinyjs) # Add shinyjs for JavaScript interactions
library(jsonlite) # For JSON handling
library(xml2) # For XML handling
library(readxl) # For Excel file handling
# List of keywords
keywords <- c("accessible", "activism", "activists", "advocacy", "advocate", "advocates", "affirming care", "all-inclusive", "allyship", "anti-racism", "antiracist", "assigned at birth", "assigned female at birth", "assigned male at birth", "assigned male", "assigned remale", "at risk", "barrier", "barriers", "belong", "bias", "biased", "biased toward", "biases", "biases towards", "biologically female", "biologically male", "BIPOC", "Black", "breastfeed", "breastfeed + people", "breastfeed + person", "chestfeed", "chestfeed + people", "chestfeed + person", "clean energy", "climate crisis", "climate science", "commercial sex worker", "community diversity", "community equity", "confirmation bias", "cultural competence", "cultural differences", "cultural heritage", "cultural sensitivity", "culturally appropriate", "culturally responsive", "DEI", "DEIA", "DEIAB", "DEIJ", "disabilities", "disability", "discriminated", "discrimination", "discriminatory", "disparity", "diverse", "diverse backgrounds", "diverse communities", "diverse community", "diverse group", "diverse groups", "diversified", "diversify", "diversifying", "diversity", "enhance the diversity", "enhancing diversity", "environmental quality", "equal opportunity", "equality", "equitable", "equitableness", "equity", "ethnicity", "excluded", "exclusion", "expression", "female", "females", "feminism", "fostering inclusivity", "GBV", "gender", "gender based", "gender based violence", "gender diversity", "gender identity", "gender ideology", "gender-affirming care", "genders", "Gulf of Mexico", "hate speech", "health disparity", "health equity", "hispanic minority", "historically", "identity", "immigrants", "implicit bias", "implicit biases", "inclusion", "inclusive", "inclusive leadership", "inclusiveness", "inclusivity", "increase diversity", "increase the diversity", "indigenous community", "inequalities", "inequality", "inequitable", "inequities", "inequity", "injustice", "institutional", "intersectional", "intersectionality", "key groups", "key people", "key populations", "Latinx", "LGBT", "LGBTQ", "marginalize", "marginalized", "men who have sex with men", "mental health", "minorities", "minority", "most risk", "MSM", "multicultural", "Mx", "Native American", "non-binary", "nonbinary", "oppression", "oppressive", "orientation", "uterus", "people + uterus", "people-centered care", "person-centered", "person-centered care", "polarization", "political", "pollution", "pregnant people", "pregnant person", "pregnant persons", "prejudice", "privilege", "privileges", "promote diversity", "promoting diversity", "pronoun", "pronouns", "prostitute", "race", "race and ethnicity", "racial", "racial diversity", "racial identity", "racial inequality", "racial justice", "racially", "racism", "segregation", "sense of belonging", "sex", "sexual preferences", "sexuality", "social justice", "sociocultural", "socioeconomic", "status", "stereotype", "stereotypes", "systemic", "systemically", "they/them", "trans", "transgender", "transsexual", "trauma", "traumatic", "tribal", "unconscious bias", "underappreciated", "underprivileged", "underrepresentation", "underrepresented", "underserved", "undervalued", "victim", "victims", "vulnerable populations", "woke", "women", "women and underrepresented"
)
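# Matching is case-insensitive and whole-word/phrase (see find_keywords() below)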
# Custom JS for PDF handling - optimized for analysis with explicit button trigger
js_code <- "
// Function to handle PDF extraction using PDF.js
function extractPdfText(fileInput) {
const file = fileInput.files[0];
if (!file || file.type !== 'application/pdf') {
return;
}
// Create a URL for the file
const fileURL = URL.createObjectURL(file);
// Set loading state immediately
Shiny.setInputValue('pdf_loading', true);
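// The server reads this flag (in the conditionalPanel, the status message, and the analyze handler) to block analysis until extraction finishes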
// Load PDF.js from CDN if not already loaded
if (typeof pdfjsLib === 'undefined') {
// Set worker source (required for PDF.js)
const script = document.createElement('script');
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js';
script.onload = function() {
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
processPdf(fileURL);
};
document.head.appendChild(script);
} else {
processPdf(fileURL);
}
// Function to process the PDF
function processPdf(url) {
pdfjsLib.getDocument(url).promise.then(function(pdf) {
let textContent = '';
let pendingPages = pdf.numPages;
// Update progress in UI with total pages info
Shiny.setInputValue('pdf_progress', { current: 0, total: pendingPages });
// Extract text from each page
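// Note: pages resolve asynchronously, so text is appended in completion order, which may differ from page order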
for (let i = 1; i <= pdf.numPages; i++) {
pdf.getPage(i).then(function(page) {
page.getTextContent().then(function(content) {
const strings = content.items.map(item => item.str);
textContent += strings.join(' ') + '\\n';
// Update progress
Shiny.setInputValue('pdf_progress', {
current: pdf.numPages - pendingPages + 1,
total: pdf.numPages
});
// Check if all pages are processed
pendingPages--;
if (pendingPages === 0) {
// Send the extracted text back to Shiny
Shiny.setInputValue('pdf_text', textContent);
Shiny.setInputValue('pdf_loading', false);
// Clean up
URL.revokeObjectURL(url);
}
});
});
}
}).catch(function(error) {
console.error('Error loading PDF:', error);
Shiny.setInputValue('pdf_error', error.message);
Shiny.setInputValue('pdf_loading', false);
URL.revokeObjectURL(url);
});
}
}
// Initialize PDF handling when document input changes
$(document).on('change', '#document', function(e) {
// Clear previous values
Shiny.setInputValue('pdf_error', null);
Shiny.setInputValue('pdf_text', null);
const file = this.files[0];
if (file && file.type === 'application/pdf') {
// For PDFs, extract text using PDF.js
extractPdfText(this);
} else {
// For non-PDFs, ensure pdf_loading is false so analysis can proceed
Shiny.setInputValue('pdf_loading', false);
}
});
"
ui <- page_fluid(
# Theme with simplified layout
theme = bs_theme(
bootswatch = "flatly",
primary = "#2c3e50",
"navbar-bg" = "#2c3e50"
),
# Include shinyjs
shinyjs::useShinyjs(),
# Include custom JavaScript and CSS for better layout
tags$head(
tags$script(HTML(js_code)),
tags$style(HTML("
/* Simplified styles with no nested cards */
body {
padding: 15px;
}
.document-info {
background-color: #f8f9fa;
padding: 15px;
border-radius: 5px;
font-family: monospace;
margin-bottom: 20px;
}
.section-header {
font-weight: bold;
font-size: 1.2rem;
margin-top: 20px;
margin-bottom: 15px;
padding-bottom: 5px;
border-bottom: 1px solid #e9ecef;
}
/* Make the table more compact and readable */
.dataTables_wrapper {
padding: 10px 0;
}
/* Snarky message styling */
.snarky-message {
font-size: 1.1rem;
padding: 15px;
margin: 20px 0;
border-radius: 5px;
font-weight: bold;
}
.snarky-warning {
background-color: #f8d7da;
color: #721c24;
border: 1px solid #f5c6cb;
}
.snarky-success {
background-color: #d4edda;
color: #155724;
border: 1px solid #c3e6cb;
}
/* Button styling */
.action-button {
margin-top: 10px;
}
/* Layout adjustments */
.col-sm-4 {
background-color: #f8f9fa;
padding: 20px;
border-radius: 5px;
}
/* Add space between columns */
.col-sm-8 {
padding-left: 30px;
}
"))
),
# Layout with sidebar and main content in a fluidRow
fluidRow(
# Sidebar panel
column(
width = 4,
h4("Document Upload", class = "mb-3"),
fileInput("document", "Choose Document",
accept = c(".docx", ".doc", ".pdf", ".txt", ".csv", ".html",
".htm", ".rtf", ".xml", ".json", ".xlsx", ".xls")),
# PDF processing status (conditionally shown)
conditionalPanel(
condition = "input.pdf_loading == true",
div(
class = "alert alert-info",
"Processing PDF... This may take a moment.",
tags$div(
class = "progress mt-2",
tags$div(
id = "pdf-progress-bar",
class = "progress-bar progress-bar-striped progress-bar-animated",
role = "progressbar",
style = "width: 0%"
)
)
)
),
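# The progress bar above is animated from the server: an observer calls shinyjs::runjs() to update its width as pdf_progress changes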
# Added analyze button for manual analysis triggering
actionButton("analyze_btn", "Analyze Document",
class = "btn-primary btn-block mt-3"),
hr(),
tags$p("Supported formats: Word (.docx, .doc), PDF, Text, CSV, HTML, XML, JSON, Excel (.xlsx, .xls), RTF"),
tags$p(class = "text-muted", "Click 'Analyze Document' after uploading to begin analysis.")
),
# Main content
column(
width = 8,
# Status message
uiOutput("status_message"),
# Document information section
conditionalPanel(
condition = "output.document_analyzed == true",
h3("Analysis Results", class = "section-header"),
# Document basic info with better styling
h4("Document Information", class = "section-header"),
div(
class = "document-info",
verbatimTextOutput("document_info")
),
# Snarky message output
htmlOutput("snarky_message"),
# Keywords table - only shown if keywords are found
conditionalPanel(
condition = "output.has_keywords == true",
h4("Keywords Found", class = "section-header"),
DT::dataTableOutput("keyword_table")
)
)
)
)
)
server <- function(input, output, session) {
# JavaScript to update progress bar
observe({
if (!is.null(input$pdf_progress)) {
progress <- input$pdf_progress
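# input$pdf_progress arrives as a list with 'current' and 'total', sent from the PDF.js code via Shiny.setInputValue()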
percentage <- round((progress$current / progress$total) * 100)
shinyjs::runjs(sprintf(
"$('#pdf-progress-bar').css('width', '%s%%').attr('aria-valuenow', %s)",
percentage, percentage
))
}
})
# Reactive values to store analysis results
results <- reactiveVal(NULL)
# Store PDF text when it becomes available
pdf_text <- reactiveVal(NULL)
observe({
if (!is.null(input$pdf_text)) {
pdf_text(input$pdf_text)
}
})
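# The two flag outputs below are never rendered directly; they only drive the
# conditionalPanel() conditions in the UI, so suspendWhenHidden = FALSE keeps
# them evaluating even though they have no visible output.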
# Output indicator for whether document has been analyzed
output$document_analyzed <- reactive({
!is.null(results())
})
outputOptions(output, "document_analyzed", suspendWhenHidden = FALSE)
# Output indicator for whether keywords were found
output$has_keywords <- reactive({
!is.null(results()) && length(results()$keywords_found) > 0
})
outputOptions(output, "has_keywords", suspendWhenHidden = FALSE)
# Status message - updated for button-triggered analysis
output$status_message <- renderUI({
if (!is.null(input$pdf_error)) {
return(div(class = "alert alert-danger",
"PDF Error: ", input$pdf_error))
}
if (!is.null(input$pdf_loading) && input$pdf_loading) {
return(div(class = "alert alert-info",
"Processing PDF... Please wait."))
}
if (is.null(results())) {
if (is.null(input$document)) {
return(div(class = "alert alert-info",
"Please upload a document and click 'Analyze Document' to begin."))
} else {
return(div(class = "alert alert-info",
"Document uploaded. Click 'Analyze Document' to begin analysis."))
}
} else if (!is.null(results()$error)) {
return(div(class = "alert alert-danger",
"Error: ", results()$error))
} else {
return(div(class = "alert alert-success",
"Analysis complete!"))
}
})
# Function to extract text from document with improved file type support
extract_text <- function(file_path) {
# Check if we have PDF text from JavaScript
if (!is.null(input$document) &&
tolower(tools::file_ext(input$document$name)) == "pdf" &&
!is.null(pdf_text())) {
return(pdf_text())
}
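# For PDFs the text has already been extracted in the browser by PDF.js;
# every other format is read server-side below.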
# Get file extension
file_ext <- tolower(tools::file_ext(file_path))
# Handle each file type appropriately
tryCatch({
if (file_ext == "txt") {
# Plain text files
text <- readLines(file_path, warn = FALSE)
return(paste(text, collapse = "\n"))
} else if (file_ext %in% c("csv")) {
# CSV files - improved handling
df <- read.csv(file_path, stringsAsFactors = FALSE)
# Convert all columns to character for better text extraction
df[] <- lapply(df, as.character)
# Combine all cells into a single text string
text <- paste(unlist(df), collapse = " ")
return(text)
} else if (file_ext %in% c("xlsx", "xls")) {
# Excel files
sheets <- readxl::excel_sheets(file_path)
all_text <- character(0)
for (sheet in sheets) {
df <- readxl::read_excel(file_path, sheet = sheet)
# Convert to character
df[] <- lapply(df, as.character)
# Add sheet content
all_text <- c(all_text, paste("Sheet:", sheet))
all_text <- c(all_text, paste(unlist(df), collapse = " "))
}
return(paste(all_text, collapse = "\n"))
} else if (file_ext == "json") {
# JSON files
json_data <- jsonlite::fromJSON(file_path)
# Recursively extract all values from JSON
extract_values <- function(obj) {
if (is.list(obj)) {
values <- unlist(lapply(obj, extract_values))
return(paste(values, collapse = " "))
} else if (is.data.frame(obj)) {
# Convert data frame to character
obj[] <- lapply(obj, as.character)
return(paste(unlist(obj), collapse = " "))
} else if (is.vector(obj) && !is.character(obj)) {
return(paste(obj, collapse = " "))
} else {
return(obj)
}
}
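# data.frames are lists too, so they normally fall into the is.list() branch above;
# the is.data.frame() branch is a fallback.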
text <- extract_values(json_data)
return(text)
} else if (file_ext == "xml") {
# XML files
xml_data <- xml2::read_xml(file_path)
# Extract all text content from XML nodes
nodes <- xml2::xml_find_all(xml_data, "//text()")
text <- xml2::xml_text(nodes)
return(paste(text, collapse = " "))
} else if (file_ext %in% c("html", "htm")) {
# HTML files - improved handling
html_content <- xml2::read_html(file_path)
nodes <- xml2::xml_find_all(html_content, "//text()")
text <- xml2::xml_text(nodes)
return(paste(text, collapse = " "))
} else {
# Use readtext as a fallback for other formats (docx, rtf, etc.)
text_data <- readtext::readtext(file_path)
return(text_data$text)
}
}, error = function(e) {
# Log the error
warning(paste("Error extracting text from document:", e$message))
return("")
})
}
# Function to find keywords in text
find_keywords <- function(text, keywords) {
text_lower <- tolower(text)
found_keywords <- character(0)
keyword_counts <- numeric(0)
for (keyword in keywords) {
# Use word boundaries to match whole words/phrases; escape regex
# metacharacters (e.g. the "+" in "breastfeed + people") so each
# keyword is matched literally
pattern <- paste0("\\b", str_escape(tolower(keyword)), "\\b")
matches <- str_count(text_lower, pattern)
if (matches > 0) {
found_keywords <- c(found_keywords, keyword)
keyword_counts <- c(keyword_counts, matches)
}
}
# Create a named vector of counts
names(keyword_counts) <- found_keywords
return(list(
keywords = found_keywords,
counts = keyword_counts
))
}
# Analyze document when button is clicked
observeEvent(input$analyze_btn, {
# Ensure a document is uploaded
if (is.null(input$document)) {
results(list(error = "Please upload a document first."))
return()
}
# Check if PDF is still loading
if (!is.null(input$pdf_loading) && input$pdf_loading) {
results(list(error = "PDF is still processing. Please wait."))
return()
}
# Check for PDF errors
if (!is.null(input$pdf_error)) {
results(list(error = paste("PDF Error:", input$pdf_error)))
return()
}
# Extract document info
doc_path <- input$document$datapath
doc_name <- input$document$name
file_type <- tolower(tools::file_ext(doc_name))
# Show progress indicator for large files
withProgress(message = 'Analyzing document...', value = 0.2, {
# Extract text from document
text <- extract_text(doc_path)
if (text == "") {
results(list(error = "Failed to extract text from the document. The file format may not be fully supported."))
return()
}
setProgress(value = 0.5, detail = "Searching for keywords...")
# Find keywords in the text
keyword_results <- find_keywords(text, keywords)
# Calculate word and character counts
words <- unlist(strsplit(text, "\\s+"))
words <- words[words != ""] # Remove empty strings
setProgress(value = 0.9, detail = "Finalizing results...")
# Store the results
results(list(
document_name = doc_name,
file_type = file_type,
text = text,
total_words = length(words),
total_chars = nchar(text),
keywords_found = keyword_results$keywords,
keyword_counts = keyword_results$counts,
error = NULL
))
})
})
# Reset PDF text when new file is uploaded
observeEvent(input$document, {
pdf_text(NULL)
# Do NOT reset results automatically when a new file is uploaded
# This allows users to explicitly trigger analysis with the button
})
# Output: Document info - simplified to just the basics
output$document_info <- renderText({
if (is.null(results())) return("")
paste0(
"File Name: ", results()$document_name, "\n",
"File Type: ", toupper(results()$file_type), "\n",
"Word Count: ", format(results()$total_words, big.mark = ","), "\n",
"Analysis Date: ", format(Sys.time(), "%Y-%m-%d %H:%M:%S")
)
})
# Snarky messages - choose randomly from a set for each case
output$snarky_message <- renderUI({
if (is.null(results())) return(NULL)
# Snarky messages for when keywords are found
warning_messages <- c(
"Uh oh, looks like your document has some words on the naughty list - no first amendment rights for you!",
"ALERT! Your document contains language that may cause conservative fainting spells!",
"WARNING: Potentially woke content detected! Hide your children!",
"Potentially woke language detected! Your document might cause pearl-clutching!",
"Snowflake trigger warning: Your document contains words that might make Tucker Carlson cry!"
)
# Snarky messages for when no keywords are found
success_messages <- c(
"Congratulations! Your document is free of scary words like \"gender\" that trigger the MAGA mind.",
"Good news! Nothing in your document will upset Fox News viewers!",
"You're safe! No words that might cause conservative heart palpitations detected.",
"Document approved for Florida schools! No scary inclusive language found!",
"Phew! Your document is officially woke-free!"
)
if (length(results()$keywords_found) > 0) {
# Keywords found - show warning message
total_keywords <- sum(results()$keyword_counts)
unique_keywords <- length(results()$keywords_found)
div(
class = "snarky-message snarky-warning",
sample(warning_messages, 1),
tags$br(),
tags$span(
style = "font-size: 0.9rem; font-weight: normal;",
paste0("Found ", total_keywords, " occurrences of ",
unique_keywords, " unique keywords")
)
)
} else {
# No keywords found - show success message
div(
class = "snarky-message snarky-success",
sample(success_messages, 1)
)
}
})
# Output: Enhanced keyword table
output$keyword_table <- DT::renderDataTable({
if (is.null(results()) || length(results()$keywords_found) == 0) return(NULL)
df <- data.frame(
Keyword = results()$keywords_found,
Occurrences = results()$keyword_counts,
stringsAsFactors = FALSE
)
# Sort by number of occurrences (descending)
df <- df[order(-df$Occurrences), ]
DT::datatable(
df,
options = list(
pageLength = 15,
order = list(list(1, 'desc')),
dom = 'tip' # table, information, and pagination (no search)
),
rownames = FALSE
) %>%
DT::formatStyle(
'Keyword',
fontWeight = 'bold'
) %>%
DT::formatStyle(
'Occurrences',
background = DT::styleColorBar(range(df$Occurrences), '#9ecae1'),
fontWeight = 'bold'
)
})
}
# Run the application
shinyApp(ui = ui, server = server)