Also you don’t have to restart it to upload another doc - it will just start over with each doc
#| standalone: true
#| viewerHeight: 800
library(shiny)
library(readtext)
library(stringr)
library(DT)
library(bslib)
library(shinyjs) # Add shinyjs for JavaScript interactions
library(jsonlite) # For JSON handling
library(xml2) # For XML handling
library(readxl) # For Excel file handling
# Function to load keywords from GitHub repository
load_github_keywords <- function() {
# URL to the raw keywords.csv file in the GitHub repository
github_url <- "https://raw.githubusercontent.com/jhelvy/magaScreener/main/keywords.csv"
tryCatch({
# Read the CSV file directly from GitHub using read.csv
keywords_df <- read.csv(github_url, stringsAsFactors = FALSE)
# Extract the keywords column
if ("words" %in% colnames(keywords_df)) {
return(keywords_df$words)
} else {
# If column name is different, take the first column
return(keywords_df[[1]])
}
}, error = function(e) {
# If GitHub fetch fails, use the fallback hardcoded keywords
warning(paste("Error fetching keywords from GitHub:", e$message, "Using fallback keywords."))
return(get_fallback_keywords())
})
}
# Fallback hardcoded keywords (original list)
get_fallback_keywords <- function() {
return(c("accessible", "activism", "activists", "advocacy", "advocate", "advocates", "affirming care", "all-inclusive", "allyship", "anti-racism", "antiracist", "assigned at birth", "assigned female at birth", "assigned male at birth", "assigned male", "assigned female", "at risk", "barrier", "barriers", "belong", "bias", "biased", "biased toward", "biases", "biases towards", "biologically female", "biologically male", "BIPOC", "Black", "breastfeed", "breastfeed + people", "breastfeed + person", "chestfeed", "chestfeed + people", "chestfeed + person", "clean energy", "climate crisis", "climate science", "commercial sex worker", "community diversity", "community equity", "confirmation bias", "cultural competence", "cultural differences", "cultural heritage", "cultural sensitivity", "culturally appropriate", "culturally responsive", "DEI", "DEIA", "DEIAB", "DEIJ", "disabilities", "disability", "discriminated", "discrimination", "discriminatory", "disparity", "diverse", "diverse backgrounds", "diverse communities", "diverse community", "diverse group", "diverse groups", "diversified", "diversify", "diversifying", "diversity", "enhance the diversity", "enhancing diversity", "environmental quality", "equal opportunity", "equality", "equitable", "equitableness", "equity", "ethnicity", "excluded", "exclusion", "expression", "female", "females", "feminism", "fostering inclusivity", "GBV", "gender", "gender based", "gender based violence", "gender diversity", "gender identity", "gender ideology", "gender-affirming care", "genders", "Gulf of Mexico", "hate speech", "health disparity", "health equity", "hispanic minority", "historically", "identity", "immigrants", "implicit bias", "implicit biases", "inclusion", "inclusive", "inclusive leadership", "inclusiveness", "inclusivity", "increase diversity", "increase the diversity", "indigenous community", "inequalities", "inequality", "inequitable", "inequities", "inequity", "injustice", "institutional", "intersectional", "intersectionality", "key groups", "key people", "key populations", "Latinx", "LGBT", "LGBTQ", "marginalize", "marginalized", "men who have sex with men", "mental health", "minorities", "minority", "most risk", "MSM", "multicultural", "Mx", "Native American", "non-binary", "nonbinary", "oppression", "oppressive", "orientation", "uterus", "people + uterus", "people-centered care", "person-centered", "person-centered care", "polarization", "political", "pollution", "pregnant people", "pregnant person", "pregnant persons", "prejudice", "privilege", "privileges", "promote diversity", "promoting diversity", "pronoun", "pronouns", "prostitute", "race", "race and ethnicity", "racial", "racial diversity", "racial identity", "racial inequality", "racial justice", "racially", "racism", "segregation", "sense of belonging", "sex", "sexual preferences", "sexuality", "social justice", "sociocultural", "socioeconomic", "status", "stereotype", "stereotypes", "systemic", "systemically", "they/them", "trans", "transgender", "transsexual", "trauma", "traumatic", "tribal", "unconscious bias", "underappreciated", "underprivileged", "underrepresentation", "underrepresented", "underserved", "undervalued", "victim", "victims", "vulnerable populations", "woke", "women", "women and underrepresented"))
}
# Custom JS for PDF handling - optimized for analysis with explicit button trigger
js_code <- "
// Function to handle PDF extraction using PDF.js
function extractPdfText(fileInput) {
const file = fileInput.files[0];
if (!file || file.type !== 'application/pdf') {
return;
}
// Create a URL for the file
const fileURL = URL.createObjectURL(file);
// Set loading state immediately
Shiny.setInputValue('pdf_loading', true);
// Load PDF.js from CDN if not already loaded
if (typeof pdfjsLib === 'undefined') {
// Set worker source (required for PDF.js)
const script = document.createElement('script');
script.src = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js';
script.onload = function() {
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
processPdf(fileURL);
};
document.head.appendChild(script);
} else {
processPdf(fileURL);
}
// Function to process the PDF
function processPdf(url) {
pdfjsLib.getDocument(url).promise.then(function(pdf) {
let textContent = '';
let pendingPages = pdf.numPages;
// Update progress in UI with total pages info
Shiny.setInputValue('pdf_progress', { current: 0, total: pendingPages });
// Extract text from each page
for (let i = 1; i <= pdf.numPages; i++) {
pdf.getPage(i).then(function(page) {
page.getTextContent().then(function(content) {
const strings = content.items.map(item => item.str);
textContent += strings.join(' ') + '\\n';
// Update progress
Shiny.setInputValue('pdf_progress', {
current: pdf.numPages - pendingPages + 1,
total: pdf.numPages
});
// Check if all pages are processed
pendingPages--;
if (pendingPages === 0) {
// Send the extracted text back to Shiny
Shiny.setInputValue('pdf_text', textContent);
Shiny.setInputValue('pdf_loading', false);
// Clean up
URL.revokeObjectURL(url);
}
});
});
}
}).catch(function(error) {
console.error('Error loading PDF:', error);
Shiny.setInputValue('pdf_error', error.message);
Shiny.setInputValue('pdf_loading', false);
URL.revokeObjectURL(url);
});
}
}
// Initialize PDF handling when document input changes
$(document).on('change', '#document', function(e) {
// Clear previous values
Shiny.setInputValue('pdf_error', null);
Shiny.setInputValue('pdf_text', null);
const file = this.files[0];
if (file && file.type === 'application/pdf') {
// For PDFs, extract text using PDF.js
extractPdfText(this);
} else {
// For non-PDFs, ensure pdf_loading is false so analysis can proceed
Shiny.setInputValue('pdf_loading', false);
}
});
"
ui <- page_fluid(
# Theme with simplified layout
theme = bs_theme(
bootswatch = "flatly",
primary = "#2c3e50",
"navbar-bg" = "#2c3e50"
),
# Include shinyjs
shinyjs::useShinyjs(),
# Include custom JavaScript and CSS for better layout
tags$head(
tags$script(HTML(js_code)),
tags$style(HTML("
/* Simplified styles with no nested cards */
body {
padding: 15px;
}
.document-info {
background-color: #f8f9fa;
padding: 15px;
border-radius: 5px;
font-family: monospace;
margin-bottom: 20px;
}
.section-header {
font-weight: bold;
font-size: 1.2rem;
margin-top: 20px;
margin-bottom: 15px;
padding-bottom: 5px;
border-bottom: 1px solid #e9ecef;
}
/* Make the table more compact and readable */
.dataTables_wrapper {
padding: 10px 0;
}
/* Snarky message styling */
.snarky-message {
font-size: 1.1rem;
padding: 15px;
margin: 20px 0;
border-radius: 5px;
font-weight: bold;
}
.snarky-warning {
background-color: #f8d7da;
color: #721c24;
border: 1px solid #f5c6cb;
}
.snarky-success {
background-color: #d4edda;
color: #155724;
border: 1px solid #c3e6cb;
}
/* Button styling */
.action-button {
margin-top: 10px;
}
/* Layout adjustments */
.col-sm-4 {
background-color: #f8f9fa;
padding: 20px;
border-radius: 5px;
}
/* Add space between columns */
.col-sm-8 {
padding-left: 30px;
}
/* Keyword source section styling */
.keyword-source-section {
margin-top: 20px;
padding-top: 15px;
border-top: 1px solid #e9ecef;
}
"))
),
# Layout with sidebar and main content in a fluidRow
fluidRow(
# Sidebar panel
column(
width = 4,
h4("Document Upload", class = "mb-3"),
fileInput("document", "Choose Document",
accept = c(".docx", ".doc", ".pdf", ".txt", ".csv", ".html",
".htm", ".rtf", ".xml", ".json", ".xlsx", ".xls")),
# PDF processing status (conditionally shown)
conditionalPanel(
condition = "input.pdf_loading == true",
div(
class = "alert alert-info",
"Processing PDF... This may take a moment.",
tags$div(
class = "progress mt-2",
tags$div(
id = "pdf-progress-bar",
class = "progress-bar progress-bar-striped progress-bar-animated",
role = "progressbar",
style = "width: 0%"
)
)
)
),
# Keyword source selection section
div(
class = "keyword-source-section",
h4("Keyword Source", class = "mb-3"),
radioButtons("keyword_source", "Select Keyword Source:",
choices = list(
"Default" = "github",
"Upload Custom Keywords" = "custom"
),
selected = "github"),
# Custom keywords file upload (conditionally shown)
conditionalPanel(
condition = "input.keyword_source == 'custom'",
fileInput("custom_keywords", "Upload Keywords CSV",
accept = c(".csv")),
tags$p(class = "text-muted", "CSV file should have a header row with 'words' column containing keywords.")
)
),
# Added analyze button for manual analysis triggering
actionButton("analyze_btn", "Analyze Document",
class = "btn-primary btn-block mt-3"),
hr(),
tags$p("Supported formats: Word (.docx, .doc), PDF, Text, CSV, HTML, XML, JSON, Excel (.xlsx, .xls), RTF"),
tags$p(class = "text-muted", "Click 'Analyze Document' after uploading to begin analysis.")
),
# Main content
column(
width = 8,
# Status message
uiOutput("status_message"),
# Document information section
conditionalPanel(
condition = "output.document_analyzed == true",
h3("Analysis Results", class = "section-header"),
# Document basic info with better styling
h4("Document Information", class = "section-header"),
div(
class = "document-info",
verbatimTextOutput("document_info")
),
# Snarky message output
htmlOutput("snarky_message"),
# Keywords table - only shown if keywords are found
conditionalPanel(
condition = "output.has_keywords == true",
h4("Keywords Found", class = "section-header"),
DT::dataTableOutput("keyword_table")
)
)
)
)
)
server <- function(input, output, session) {
# JavaScript to update progress bar
observe({
if (!is.null(input$pdf_progress)) {
progress <- input$pdf_progress
percentage <- round((progress$current / progress$total) * 100)
shinyjs::runjs(sprintf(
"$('#pdf-progress-bar').css('width', '%s%%').attr('aria-valuenow', %s)",
percentage, percentage
))
}
})
# Reactive values to store analysis results
results <- reactiveVal(NULL)
# Store PDF text when it becomes available
pdf_text <- reactiveVal(NULL)
observe({
if (!is.null(input$pdf_text)) {
pdf_text(input$pdf_text)
}
})
# Reactive value to store current keywords
current_keywords <- reactiveVal(NULL)
# Initialize keywords from GitHub when app starts
observe({
# Only load if current_keywords is NULL (first load)
if (is.null(current_keywords())) {
github_keywords <- load_github_keywords()
current_keywords(github_keywords)
}
})
# Update keywords when source changes or custom file is uploaded
observe({
# Reset when source changes
if (input$keyword_source == "github") {
github_keywords <- load_github_keywords()
current_keywords(github_keywords)
} else if (input$keyword_source == "custom" && !is.null(input$custom_keywords)) {
# Read custom keywords from uploaded file
tryCatch({
custom_file <- input$custom_keywords
custom_keywords_df <- read.csv(custom_file$datapath, stringsAsFactors = FALSE)
# Extract keywords from the dataframe
if ("words" %in% colnames(custom_keywords_df)) {
custom_keywords <- custom_keywords_df$words
} else {
# If column name is different, take the first column
custom_keywords <- custom_keywords_df[[1]]
}
# Update current keywords
current_keywords(custom_keywords)
}, error = function(e) {
# If error occurs, show a notification and use GitHub keywords
showNotification(
paste("Error reading custom keywords:", e$message, "Using default keywords instead."),
type = "error",
duration = 10
)
github_keywords <- load_github_keywords()
current_keywords(github_keywords)
})
}
})
# Output indicator for whether document has been analyzed
output$document_analyzed <- reactive({
!is.null(results())
})
outputOptions(output, "document_analyzed", suspendWhenHidden = FALSE)
# Output indicator for whether keywords were found
output$has_keywords <- reactive({
!is.null(results()) && length(results()$keywords_found) > 0
})
outputOptions(output, "has_keywords", suspendWhenHidden = FALSE)
# Status message - updated for button-triggered analysis
output$status_message <- renderUI({
if (!is.null(input$pdf_error)) {
return(div(class = "alert alert-danger",
"PDF Error: ", input$pdf_error))
}
if (!is.null(input$pdf_loading) && input$pdf_loading) {
return(div(class = "alert alert-info",
"Processing PDF... Please wait."))
}
if (is.null(results())) {
if (is.null(input$document)) {
return(div(class = "alert alert-info",
"Please upload a document and click 'Analyze Document' to begin."))
} else {
return(div(class = "alert alert-info",
"Document uploaded. Click 'Analyze Document' to begin analysis."))
}
} else if (!is.null(results()$error)) {
return(div(class = "alert alert-danger",
"Error: ", results()$error))
} else {
return(div(class = "alert alert-success",
"Analysis complete!"))
}
})
# Function to extract text from document with improved file type support
extract_text <- function(file_path) {
# Check if we have PDF text from JavaScript
if (!is.null(input$document) &&
tolower(tools::file_ext(input$document$name)) == "pdf" &&
!is.null(pdf_text())) {
return(pdf_text())
}
# Get file extension
file_ext <- tolower(tools::file_ext(file_path))
# Handle each file type appropriately
tryCatch({
if (file_ext == "txt") {
# Plain text files
text <- readLines(file_path, warn = FALSE)
return(paste(text, collapse = "\n"))
} else if (file_ext %in% c("csv")) {
# CSV files - improved handling
df <- read.csv(file_path, stringsAsFactors = FALSE)
# Convert all columns to character for better text extraction
df[] <- lapply(df, as.character)
# Combine all cells into a single text string
text <- paste(unlist(df), collapse = " ")
return(text)
} else if (file_ext %in% c("xlsx", "xls")) {
# Excel files
sheets <- readxl::excel_sheets(file_path)
all_text <- character(0)
for (sheet in sheets) {
df <- readxl::read_excel(file_path, sheet = sheet)
# Convert to character
df[] <- lapply(df, as.character)
# Add sheet content
all_text <- c(all_text, paste("Sheet:", sheet))
all_text <- c(all_text, paste(unlist(df), collapse = " "))
}
return(paste(all_text, collapse = "\n"))
} else if (file_ext == "json") {
# JSON files
json_data <- jsonlite::fromJSON(file_path)
# Recursively extract all values from JSON
extract_values <- function(obj) {
if (is.list(obj)) {
values <- unlist(lapply(obj, extract_values))
return(paste(values, collapse = " "))
} else if (is.data.frame(obj)) {
# Convert data frame to character
obj[] <- lapply(obj, as.character)
return(paste(unlist(obj), collapse = " "))
} else if (is.vector(obj) && !is.character(obj)) {
return(paste(obj, collapse = " "))
} else {
return(obj)
}
}
text <- extract_values(json_data)
return(text)
} else if (file_ext == "xml") {
# XML files
xml_data <- xml2::read_xml(file_path)
# Extract all text content from XML nodes
nodes <- xml2::xml_find_all(xml_data, "//text()")
text <- xml2::xml_text(nodes)
return(paste(text, collapse = " "))
} else if (file_ext %in% c("html", "htm")) {
# HTML files - improved handling
html_content <- xml2::read_html(file_path)
nodes <- xml2::xml_find_all(html_content, "//text()")
text <- xml2::xml_text(nodes)
return(paste(text, collapse = " "))
} else {
# Use readtext as a fallback for other formats (docx, rtf, etc.)
text_data <- readtext::readtext(file_path)
return(text_data$text)
}
}, error = function(e) {
# Log the error
warning(paste("Error extracting text from document:", e$message))
return("")
})
}
# Function to find keywords in text
find_keywords <- function(text, keywords) {
text_lower <- tolower(text)
found_keywords <- character(0)
keyword_counts <- numeric(0)
for (keyword in keywords) {
# Use word boundaries to match whole words
pattern <- paste0("\\b", tolower(keyword), "\\b")
matches <- str_count(text_lower, pattern)
if (matches > 0) {
found_keywords <- c(found_keywords, keyword)
keyword_counts <- c(keyword_counts, matches)
}
}
# Create a named vector of counts
names(keyword_counts) <- found_keywords
return(list(
keywords = found_keywords,
counts = keyword_counts
))
}
# Analyze document when button is clicked
observeEvent(input$analyze_btn, {
# Ensure a document is uploaded
if (is.null(input$document)) {
results(list(error = "Please upload a document first."))
return()
}
# Check if PDF is still loading
if (!is.null(input$pdf_loading) && input$pdf_loading) {
results(list(error = "PDF is still processing. Please wait."))
return()
}
# Check for PDF errors
if (!is.null(input$pdf_error)) {
results(list(error = paste("PDF Error:", input$pdf_error)))
return()
}
# Check if we have keywords
if (is.null(current_keywords()) || length(current_keywords()) == 0) {
results(list(error = "No keywords available for analysis. Please check keyword source."))
return()
}
# Extract document info
doc_path <- input$document$datapath
doc_name <- input$document$name
file_type <- tolower(tools::file_ext(doc_name))
# Show progress indicator for large files
withProgress(message = 'Analyzing document...', value = 0.2, {
# Extract text from document
text <- extract_text(doc_path)
if (text == "") {
results(list(error = "Failed to extract text from the document. The file format may not be fully supported."))
return()
}
setProgress(value = 0.5, detail = "Searching for keywords...")
# Find keywords in the text using current keywords
keyword_results <- find_keywords(text, current_keywords())
# Calculate word and character counts
words <- unlist(strsplit(text, "\\s+"))
words <- words[words != ""] # Remove empty strings
setProgress(value = 0.9, detail = "Finalizing results...")
# Store the results
results(list(
document_name = doc_name,
file_type = file_type,
text = text,
total_words = length(words),
total_chars = nchar(text),
keywords_found = keyword_results$keywords,
keyword_counts = keyword_results$counts,
keyword_source = input$keyword_source,
error = NULL
))
})
})
# Reset PDF text when new file is uploaded
observeEvent(input$document, {
pdf_text(NULL)
# Do NOT reset results automatically when a new file is uploaded
# This allows users to explicitly trigger analysis with the button
})
# Output: Document info - simplified to just the basics
output$document_info <- renderText({
if (is.null(results())) return("")
# Add keyword source information
keyword_source_info <- if (results()$keyword_source == "github") {
"GitHub Repository"
} else {
"Custom Upload"
}
paste0(
"File Name: ", results()$document_name, "\n",
"File Type: ", toupper(results()$file_type), "\n",
"Word Count: ", format(results()$total_words, big.mark = ","), "\n",
"Keyword Source: ", keyword_source_info, "\n",
"Analysis Date: ", format(Sys.time(), "%Y-%m-%d %H:%M:%S")
)
})
# Snarky messages - choose randomly from a set for each case
output$snarky_message <- renderUI({
if (is.null(results())) return(NULL)
# Snarky messages for when keywords are found
warning_messages <- c(
"Uh oh, looks like your document has some words on the naughty list - no first amendment rights for you!",
"ALERT! Your document contains language that may cause conservative fainting spells!",
"WARNING: Potentially woke content detected! Hide your children!",
"Potentially woke language detected! Your document might cause pearl-clutching!",
"Snowflake trigger warning: Your document contains words that might make Tucker Carlson cry!"
)
# Snarky messages for when no keywords are found
success_messages <- c(
"Congratulations! Your document is free of scary words like \"gender\" that trigger the MAGA mind.",
"Good news! Nothing in your document will upset Fox News viewers!",
"You're safe! No words that might cause conservative heart palpitations detected.",
"Document approved for Florida schools! No scary inclusive language found!",
"Phew! Your document is officially woke-free!"
)
if (length(results()$keywords_found) > 0) {
# Keywords found - show warning message
total_keywords <- sum(results()$keyword_counts)
unique_keywords <- length(results()$keywords_found)
div(
class = "snarky-message snarky-warning",
sample(warning_messages, 1),
tags$br(),
tags$span(
style = "font-size: 0.9rem; font-weight: normal;",
paste0("Found ", total_keywords, " occurrences of ",
unique_keywords, " unique keywords")
)
)
} else {
# No keywords found - show success message
div(
class = "snarky-message snarky-success",
sample(success_messages, 1)
)
}
})
# Output: Enhanced keyword table
output$keyword_table <- DT::renderDataTable({
if (is.null(results()) || length(results()$keywords_found) == 0) return(NULL)
df <- data.frame(
Keyword = results()$keywords_found,
Occurrences = results()$keyword_counts,
stringsAsFactors = FALSE
)
# Sort by number of occurrences (descending)
df <- df[order(-df$Occurrences), ]
DT::datatable(
df,
options = list(
pageLength = 15,
order = list(list(1, 'desc')),
dom = 'tip' # table, information, and pagination (no search)
),
rownames = FALSE
) %>%
DT::formatStyle(
'Keyword',
fontWeight = 'bold'
) %>%
DT::formatStyle(
'Occurrences',
background = DT::styleColorBar(range(df$Occurrences), '#9ecae1'),
fontWeight = 'bold'
)
})
}
# Run the application
shinyApp(ui = ui, server = server)