MAGA Keyword Screener

This can take ~30s to load…just leave the window open until it loads
Also you don’t have to restart it to upload another doc - it will just start over with each doc
#| standalone: true
#| viewerHeight: 800

library(shiny)
library(readtext)
library(stringr)
library(DT)
library(bslib)
library(shinyjs)  # Add shinyjs for JavaScript interactions
library(jsonlite) # For JSON handling
library(xml2)     # For XML handling
library(readxl)   # For Excel file handling

# Function to load keywords from GitHub repository
load_github_keywords <- function() {
  # URL to the raw keywords.csv file in the GitHub repository
  github_url <- "https://raw.githubusercontent.com/jhelvy/magaScreener/main/keywords.csv"
  
  tryCatch({
    # Read the CSV file directly from GitHub using read.csv
    keywords_df <- read.csv(github_url, stringsAsFactors = FALSE)
    
    # Extract the keywords column
    if ("words" %in% colnames(keywords_df)) {
      return(keywords_df$words)
    } else {
      # If column name is different, take the first column
      return(keywords_df[[1]])
    }
  }, error = function(e) {
    # If GitHub fetch fails, use the fallback hardcoded keywords
    warning(paste("Error fetching keywords from GitHub:", e$message, "Using fallback keywords."))
    return(get_fallback_keywords())
  })
}

# Fallback hardcoded keywords (original list)
get_fallback_keywords <- function() {
  return(c("accessible", "activism", "activists", "advocacy", "advocate", "advocates", "affirming care", "all-inclusive", "allyship", "anti-racism", "antiracist", "assigned at birth", "assigned female at birth", "assigned male at birth", "assigned male", "assigned female", "at risk", "barrier", "barriers", "belong", "bias", "biased", "biased toward", "biases", "biases towards", "biologically female", "biologically male", "BIPOC", "Black", "breastfeed", "breastfeed + people", "breastfeed + person", "chestfeed", "chestfeed + people", "chestfeed + person", "clean energy", "climate crisis", "climate science", "commercial sex worker", "community diversity", "community equity", "confirmation bias", "cultural competence", "cultural differences", "cultural heritage", "cultural sensitivity", "culturally appropriate", "culturally responsive", "DEI", "DEIA", "DEIAB", "DEIJ", "disabilities", "disability", "discriminated", "discrimination", "discriminatory", "disparity", "diverse", "diverse backgrounds", "diverse communities", "diverse community", "diverse group", "diverse groups", "diversified", "diversify", "diversifying", "diversity", "enhance the diversity", "enhancing diversity", "environmental quality", "equal opportunity", "equality", "equitable", "equitableness", "equity", "ethnicity", "excluded", "exclusion", "expression", "female", "females", "feminism", "fostering inclusivity", "GBV", "gender", "gender based", "gender based violence", "gender diversity", "gender identity", "gender ideology", "gender-affirming care", "genders", "Gulf of Mexico", "hate speech", "health disparity", "health equity", "hispanic minority", "historically", "identity", "immigrants", "implicit bias", "implicit biases", "inclusion", "inclusive", "inclusive leadership", "inclusiveness", "inclusivity", "increase diversity", "increase the diversity", "indigenous community", "inequalities", "inequality", "inequitable", "inequities", "inequity", "injustice", "institutional", "intersectional", "intersectionality", "key groups", "key people", "key populations", "Latinx", "LGBT", "LGBTQ", "marginalize", "marginalized", "men who have sex with men", "mental health", "minorities", "minority", "most risk", "MSM", "multicultural", "Mx", "Native American", "non-binary", "nonbinary", "oppression", "oppressive", "orientation", "uterus", "people + uterus", "people-centered care", "person-centered", "person-centered care", "polarization", "political", "pollution", "pregnant people", "pregnant person", "pregnant persons", "prejudice", "privilege", "privileges", "promote diversity", "promoting diversity", "pronoun", "pronouns", "prostitute", "race", "race and ethnicity", "racial", "racial diversity", "racial identity", "racial inequality", "racial justice", "racially", "racism", "segregation", "sense of belonging", "sex", "sexual preferences", "sexuality", "social justice", "sociocultural", "socioeconomic", "status", "stereotype", "stereotypes", "systemic", "systemically", "they/them", "trans", "transgender", "transsexual", "trauma", "traumatic", "tribal", "unconscious bias", "underappreciated", "underprivileged", "underrepresentation", "underrepresented", "underserved", "undervalued", "victim", "victims", "vulnerable populations", "woke", "women", "women and underrepresented"))
}

# Custom JS for PDF handling - optimized for analysis with explicit button trigger
js_code <- "
// Function to handle PDF extraction using PDF.js
function extractPdfText(fileInput) {
  const file = fileInput.files[0];
  if (!file || file.type !== 'application/pdf') {
    return;
  }
  
  // Create a URL for the file
  const fileURL = URL.createObjectURL(file);
  
  // Set loading state immediately
  Shiny.setInputValue('pdf_loading', true);
  
  // Load PDF.js from CDN if not already loaded
  if (typeof pdfjsLib === 'undefined') {
    // Set worker source (required for PDF.js)
    const script = document.createElement('script');
    script.src = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js';
    script.onload = function() {
      pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
      processPdf(fileURL);
    };
    document.head.appendChild(script);
  } else {
    processPdf(fileURL);
  }
  
  // Function to process the PDF
  function processPdf(url) {
    pdfjsLib.getDocument(url).promise.then(function(pdf) {
      let textContent = '';
      let pendingPages = pdf.numPages;
      
      // Update progress in UI with total pages info
      Shiny.setInputValue('pdf_progress', { current: 0, total: pendingPages });
      
      // Extract text from each page
      for (let i = 1; i <= pdf.numPages; i++) {
        pdf.getPage(i).then(function(page) {
          page.getTextContent().then(function(content) {
            const strings = content.items.map(item => item.str);
            textContent += strings.join(' ') + '\\n';
            
            // Update progress
            Shiny.setInputValue('pdf_progress', { 
              current: pdf.numPages - pendingPages + 1, 
              total: pdf.numPages 
            });
            
            // Check if all pages are processed
            pendingPages--;
            if (pendingPages === 0) {
              // Send the extracted text back to Shiny
              Shiny.setInputValue('pdf_text', textContent);
              Shiny.setInputValue('pdf_loading', false);
              
              // Clean up
              URL.revokeObjectURL(url);
            }
          });
        });
      }
    }).catch(function(error) {
      console.error('Error loading PDF:', error);
      Shiny.setInputValue('pdf_error', error.message);
      Shiny.setInputValue('pdf_loading', false);
      URL.revokeObjectURL(url);
    });
  }
}

// Initialize PDF handling when document input changes
$(document).on('change', '#document', function(e) {
  // Clear previous values 
  Shiny.setInputValue('pdf_error', null);
  Shiny.setInputValue('pdf_text', null);
  
  const file = this.files[0];
  if (file && file.type === 'application/pdf') {
    // For PDFs, extract text using PDF.js
    extractPdfText(this);
  } else {
    // For non-PDFs, ensure pdf_loading is false so analysis can proceed
    Shiny.setInputValue('pdf_loading', false);
  }
});
"

ui <- page_fluid(
  # Theme with simplified layout
  theme = bs_theme(
    bootswatch = "flatly",
    primary = "#2c3e50",
    "navbar-bg" = "#2c3e50"
  ),
  
  # Include shinyjs
  shinyjs::useShinyjs(),
  
  # Include custom JavaScript and CSS for better layout
  tags$head(
    tags$script(HTML(js_code)),
    tags$style(HTML("
      /* Simplified styles with no nested cards */
      body {
        padding: 15px;
      }
      
      .document-info {
        background-color: #f8f9fa;
        padding: 15px;
        border-radius: 5px;
        font-family: monospace;
        margin-bottom: 20px;
      }
      
      .section-header {
        font-weight: bold;
        font-size: 1.2rem;
        margin-top: 20px;
        margin-bottom: 15px;
        padding-bottom: 5px;
        border-bottom: 1px solid #e9ecef;
      }
      
      /* Make the table more compact and readable */
      .dataTables_wrapper {
        padding: 10px 0;
      }
      
      /* Snarky message styling */
      .snarky-message {
        font-size: 1.1rem;
        padding: 15px;
        margin: 20px 0;
        border-radius: 5px;
        font-weight: bold;
      }
      
      .snarky-warning {
        background-color: #f8d7da;
        color: #721c24;
        border: 1px solid #f5c6cb;
      }
      
      .snarky-success {
        background-color: #d4edda;
        color: #155724;
        border: 1px solid #c3e6cb;
      }
      
      /* Button styling */
      .action-button {
        margin-top: 10px;
      }
      
      /* Layout adjustments */
      .col-sm-4 {
        background-color: #f8f9fa;
        padding: 20px;
        border-radius: 5px;
      }
      
      /* Add space between columns */
      .col-sm-8 {
        padding-left: 30px;
      }
      
      /* Keyword source section styling */
      .keyword-source-section {
        margin-top: 20px;
        padding-top: 15px;
        border-top: 1px solid #e9ecef;
      }
    "))
  ),
  
  # Layout with sidebar and main content in a fluidRow
  fluidRow(
    # Sidebar panel
    column(
      width = 4,
      h4("Document Upload", class = "mb-3"),
      fileInput("document", "Choose Document", 
                accept = c(".docx", ".doc", ".pdf", ".txt", ".csv", ".html", 
                           ".htm", ".rtf", ".xml", ".json", ".xlsx", ".xls")),
      
      # PDF processing status (conditionally shown)
      conditionalPanel(
        condition = "input.pdf_loading == true",
        div(
          class = "alert alert-info",
          "Processing PDF... This may take a moment.",
          tags$div(
            class = "progress mt-2",
            tags$div(
              id = "pdf-progress-bar",
              class = "progress-bar progress-bar-striped progress-bar-animated",
              role = "progressbar",
              style = "width: 0%"
            )
          )
        )
      ),
      
      # Keyword source selection section
      div(
        class = "keyword-source-section",
        h4("Keyword Source", class = "mb-3"),
        radioButtons("keyword_source", "Select Keyword Source:",
                    choices = list(
                      "Default" = "github",
                      "Upload Custom Keywords" = "custom"
                    ),
                    selected = "github"),
        
        # Custom keywords file upload (conditionally shown)
        conditionalPanel(
          condition = "input.keyword_source == 'custom'",
          fileInput("custom_keywords", "Upload Keywords CSV",
                   accept = c(".csv")),
          tags$p(class = "text-muted", "CSV file should have a header row with 'words' column containing keywords.")
        )
      ),
      
      # Added analyze button for manual analysis triggering
      actionButton("analyze_btn", "Analyze Document", 
                   class = "btn-primary btn-block mt-3"),
      
      hr(),
      
      tags$p("Supported formats: Word (.docx, .doc), PDF, Text, CSV, HTML, XML, JSON, Excel (.xlsx, .xls), RTF"),
      tags$p(class = "text-muted", "Click 'Analyze Document' after uploading to begin analysis.")
    ),
    
    # Main content
    column(
      width = 8,
      # Status message
      uiOutput("status_message"),
      
      # Document information section
      conditionalPanel(
        condition = "output.document_analyzed == true",
        
        h3("Analysis Results", class = "section-header"),
        
        # Document basic info with better styling
        h4("Document Information", class = "section-header"),
        div(
          class = "document-info",
          verbatimTextOutput("document_info")
        ),
        
        # Snarky message output
        htmlOutput("snarky_message"),
        
        # Keywords table - only shown if keywords are found
        conditionalPanel(
          condition = "output.has_keywords == true",
          h4("Keywords Found", class = "section-header"),
          DT::dataTableOutput("keyword_table")
        )
      )
    )
  )
)

server <- function(input, output, session) {
  
  # JavaScript to update progress bar
  observe({
    if (!is.null(input$pdf_progress)) {
      progress <- input$pdf_progress
      percentage <- round((progress$current / progress$total) * 100)
      shinyjs::runjs(sprintf(
        "$('#pdf-progress-bar').css('width', '%s%%').attr('aria-valuenow', %s)",
        percentage, percentage
      ))
    }
  })
  
  # Reactive values to store analysis results
  results <- reactiveVal(NULL)
  
  # Store PDF text when it becomes available
  pdf_text <- reactiveVal(NULL)
  observe({
    if (!is.null(input$pdf_text)) {
      pdf_text(input$pdf_text)
    }
  })
  
  # Reactive value to store current keywords
  current_keywords <- reactiveVal(NULL)
  
  # Initialize keywords from GitHub when app starts
  observe({
    # Only load if current_keywords is NULL (first load)
    if (is.null(current_keywords())) {
      github_keywords <- load_github_keywords()
      current_keywords(github_keywords)
    }
  })
  
  # Update keywords when source changes or custom file is uploaded
  observe({
    # Reset when source changes
    if (input$keyword_source == "github") {
      github_keywords <- load_github_keywords()
      current_keywords(github_keywords)
    } else if (input$keyword_source == "custom" && !is.null(input$custom_keywords)) {
      # Read custom keywords from uploaded file
      tryCatch({
        custom_file <- input$custom_keywords
        custom_keywords_df <- read.csv(custom_file$datapath, stringsAsFactors = FALSE)
        
        # Extract keywords from the dataframe
        if ("words" %in% colnames(custom_keywords_df)) {
          custom_keywords <- custom_keywords_df$words
        } else {
          # If column name is different, take the first column
          custom_keywords <- custom_keywords_df[[1]]
        }
        
        # Update current keywords
        current_keywords(custom_keywords)
      }, error = function(e) {
        # If error occurs, show a notification and use GitHub keywords
        showNotification(
          paste("Error reading custom keywords:", e$message, "Using default keywords instead."),
          type = "error",
          duration = 10
        )
        github_keywords <- load_github_keywords()
        current_keywords(github_keywords)
      })
    }
  })
  
  # Output indicator for whether document has been analyzed
  output$document_analyzed <- reactive({
    !is.null(results())
  })
  outputOptions(output, "document_analyzed", suspendWhenHidden = FALSE)
  
  # Output indicator for whether keywords were found
  output$has_keywords <- reactive({
    !is.null(results()) && length(results()$keywords_found) > 0
  })
  outputOptions(output, "has_keywords", suspendWhenHidden = FALSE)
  
  # Status message - updated for button-triggered analysis
  output$status_message <- renderUI({
    if (!is.null(input$pdf_error)) {
      return(div(class = "alert alert-danger", 
                 "PDF Error: ", input$pdf_error))
    }
    
    if (!is.null(input$pdf_loading) && input$pdf_loading) {
      return(div(class = "alert alert-info", 
                 "Processing PDF... Please wait."))
    }
    
    if (is.null(results())) {
      if (is.null(input$document)) {
        return(div(class = "alert alert-info", 
                   "Please upload a document and click 'Analyze Document' to begin."))
      } else {
        return(div(class = "alert alert-info", 
                   "Document uploaded. Click 'Analyze Document' to begin analysis."))
      }
    } else if (!is.null(results()$error)) {
      return(div(class = "alert alert-danger", 
                 "Error: ", results()$error))
    } else {
      return(div(class = "alert alert-success", 
                 "Analysis complete!"))
    }
  })
  
  # Function to extract text from document with improved file type support
  extract_text <- function(file_path) {
    # Check if we have PDF text from JavaScript
    if (!is.null(input$document) && 
        tolower(tools::file_ext(input$document$name)) == "pdf" && 
        !is.null(pdf_text())) {
      return(pdf_text())
    }
    
    # Get file extension
    file_ext <- tolower(tools::file_ext(file_path))
    
    # Handle each file type appropriately
    tryCatch({
      if (file_ext == "txt") {
        # Plain text files
        text <- readLines(file_path, warn = FALSE)
        return(paste(text, collapse = "\n"))
        
      } else if (file_ext %in% c("csv")) {
        # CSV files - improved handling
        df <- read.csv(file_path, stringsAsFactors = FALSE)
        # Convert all columns to character for better text extraction
        df[] <- lapply(df, as.character)
        # Combine all cells into a single text string
        text <- paste(unlist(df), collapse = " ")
        return(text)
        
      } else if (file_ext %in% c("xlsx", "xls")) {
        # Excel files
        sheets <- readxl::excel_sheets(file_path)
        all_text <- character(0)
        
        for (sheet in sheets) {
          df <- readxl::read_excel(file_path, sheet = sheet)
          # Convert to character
          df[] <- lapply(df, as.character)
          # Add sheet content
          all_text <- c(all_text, paste("Sheet:", sheet))
          all_text <- c(all_text, paste(unlist(df), collapse = " "))
        }
        
        return(paste(all_text, collapse = "\n"))
        
      } else if (file_ext == "json") {
        # JSON files
        json_data <- jsonlite::fromJSON(file_path)
        # Recursively extract all values from JSON
        extract_values <- function(obj) {
          if (is.list(obj)) {
            values <- unlist(lapply(obj, extract_values))
            return(paste(values, collapse = " "))
          } else if (is.data.frame(obj)) {
            # Convert data frame to character
            obj[] <- lapply(obj, as.character)
            return(paste(unlist(obj), collapse = " "))
          } else if (is.vector(obj) && !is.character(obj)) {
            return(paste(obj, collapse = " "))
          } else {
            return(obj)
          }
        }
        
        text <- extract_values(json_data)
        return(text)
        
      } else if (file_ext == "xml") {
        # XML files
        xml_data <- xml2::read_xml(file_path)
        # Extract all text content from XML nodes
        nodes <- xml2::xml_find_all(xml_data, "//text()")
        text <- xml2::xml_text(nodes)
        return(paste(text, collapse = " "))
        
      } else if (file_ext %in% c("html", "htm")) {
        # HTML files - improved handling
        html_content <- xml2::read_html(file_path)
        nodes <- xml2::xml_find_all(html_content, "//text()")
        text <- xml2::xml_text(nodes)
        return(paste(text, collapse = " "))
        
      } else {
        # Use readtext as a fallback for other formats (docx, rtf, etc.)
        text_data <- readtext::readtext(file_path)
        return(text_data$text)
      }
    }, error = function(e) {
      # Log the error
      warning(paste("Error extracting text from document:", e$message))
      return("")
    })
  }
  
  # Function to find keywords in text
  find_keywords <- function(text, keywords) {
    text_lower <- tolower(text)
    found_keywords <- character(0)
    keyword_counts <- numeric(0)
    
    for (keyword in keywords) {
      # Use word boundaries to match whole words
      pattern <- paste0("\\b", tolower(keyword), "\\b")
      matches <- str_count(text_lower, pattern)
      
      if (matches > 0) {
        found_keywords <- c(found_keywords, keyword)
        keyword_counts <- c(keyword_counts, matches)
      }
    }
    
    # Create a named vector of counts
    names(keyword_counts) <- found_keywords
    
    return(list(
      keywords = found_keywords,
      counts = keyword_counts
    ))
  }
  
  # Analyze document when button is clicked
  observeEvent(input$analyze_btn, {
    # Ensure a document is uploaded
    if (is.null(input$document)) {
      results(list(error = "Please upload a document first."))
      return()
    }
    
    # Check if PDF is still loading
    if (!is.null(input$pdf_loading) && input$pdf_loading) {
      results(list(error = "PDF is still processing. Please wait."))
      return()
    }
    
    # Check for PDF errors
    if (!is.null(input$pdf_error)) {
      results(list(error = paste("PDF Error:", input$pdf_error)))
      return()
    }
    
    # Check if we have keywords
    if (is.null(current_keywords()) || length(current_keywords()) == 0) {
      results(list(error = "No keywords available for analysis. Please check keyword source."))
      return()
    }
    
    # Extract document info
    doc_path <- input$document$datapath
    doc_name <- input$document$name
    file_type <- tolower(tools::file_ext(doc_name))
    
    # Show progress indicator for large files
    withProgress(message = 'Analyzing document...', value = 0.2, {
      # Extract text from document
      text <- extract_text(doc_path)
      
      if (text == "") {
        results(list(error = "Failed to extract text from the document. The file format may not be fully supported."))
        return()
      }
      
      setProgress(value = 0.5, detail = "Searching for keywords...")
      
      # Find keywords in the text using current keywords
      keyword_results <- find_keywords(text, current_keywords())
      
      # Calculate word and character counts
      words <- unlist(strsplit(text, "\\s+"))
      words <- words[words != ""] # Remove empty strings
      
      setProgress(value = 0.9, detail = "Finalizing results...")
      
      # Store the results
      results(list(
        document_name = doc_name,
        file_type = file_type,
        text = text,
        total_words = length(words),
        total_chars = nchar(text),
        keywords_found = keyword_results$keywords,
        keyword_counts = keyword_results$counts,
        keyword_source = input$keyword_source,
        error = NULL
      ))
    })
  })
  
  # Reset PDF text when new file is uploaded
  observeEvent(input$document, {
    pdf_text(NULL)
    # Do NOT reset results automatically when a new file is uploaded
    # This allows users to explicitly trigger analysis with the button
  })
  
  # Output: Document info - simplified to just the basics
  output$document_info <- renderText({
    if (is.null(results())) return("")
    
    # Add keyword source information
    keyword_source_info <- if (results()$keyword_source == "github") {
      "GitHub Repository"
    } else {
      "Custom Upload"
    }
    
    paste0(
      "File Name: ", results()$document_name, "\n",
      "File Type: ", toupper(results()$file_type), "\n",
      "Word Count: ", format(results()$total_words, big.mark = ","), "\n",
      "Keyword Source: ", keyword_source_info, "\n",
      "Analysis Date: ", format(Sys.time(), "%Y-%m-%d %H:%M:%S")
    )
  })
  
  # Snarky messages - choose randomly from a set for each case
  output$snarky_message <- renderUI({
    if (is.null(results())) return(NULL)

  # Snarky messages for when keywords are found
    warning_messages <- c(
      "Uh oh, looks like your document has some words on the naughty list - no first amendment rights for you!",
      "ALERT! Your document contains language that may cause conservative fainting spells!",
      "WARNING: Potentially woke content detected! Hide your children!",
      "Potentially woke language detected! Your document might cause pearl-clutching!",
      "Snowflake trigger warning: Your document contains words that might make Tucker Carlson cry!"
    )
    
    # Snarky messages for when no keywords are found
    success_messages <- c(
      "Congratulations! Your document is free of scary words like \"gender\" that trigger the MAGA mind.",
      "Good news! Nothing in your document will upset Fox News viewers!",
      "You're safe! No words that might cause conservative heart palpitations detected.",
      "Document approved for Florida schools! No scary inclusive language found!", 
      "Phew! Your document is officially woke-free!"
    )

    if (length(results()$keywords_found) > 0) {
      # Keywords found - show warning message
      total_keywords <- sum(results()$keyword_counts)
      unique_keywords <- length(results()$keywords_found)
      
      div(
        class = "snarky-message snarky-warning",
        sample(warning_messages, 1),
        tags$br(),
        tags$span(
          style = "font-size: 0.9rem; font-weight: normal;",
          paste0("Found ", total_keywords, " occurrences of ", 
                unique_keywords, " unique keywords")
        )
      )
    } else {
      # No keywords found - show success message
      div(
        class = "snarky-message snarky-success",
        sample(success_messages, 1)
      )
    }
  })
  
  # Output: Enhanced keyword table
  output$keyword_table <- DT::renderDataTable({
    if (is.null(results()) || length(results()$keywords_found) == 0) return(NULL)
    
    df <- data.frame(
      Keyword = results()$keywords_found,
      Occurrences = results()$keyword_counts,
      stringsAsFactors = FALSE
    )
    
    # Sort by number of occurrences (descending)
    df <- df[order(-df$Occurrences), ]
    
    DT::datatable(
      df,
      options = list(
        pageLength = 15,
        order = list(list(1, 'desc')),
        dom = 'tip'  # table, information, and pagination (no search)
      ),
      rownames = FALSE
    ) %>%
      DT::formatStyle(
        'Keyword',
        fontWeight = 'bold'
      ) %>%
      DT::formatStyle(
        'Occurrences',
        background = DT::styleColorBar(range(df$Occurrences), '#9ecae1'),
        fontWeight = 'bold'
      )
  })
}

# Run the application
shinyApp(ui = ui, server = server)