#!/bin/bash
#
# File: check-pdf-errors
# Version: 0.98b
#
# Checks whether the PDF files in the current directory have issues in matching the CEURART style.
# (C) 2024-2025 by Manfred Jeusfeld. This script is made available under the
# Creative Commons Attribution-ShareAlike CC-BY-SA 4.0 license.
#
# The BASH script is part of the scripts used for CEUR-WS.org. No warranty whatsoever. No support.
#
# Requires the installation of certain packages, in particular pdf2txt, pdffonts, and perl. The tool pdf2txt is part of the 
# package python-pdfminer. The pdffonts command is part of the package poppler-utils. 
# Install on Debian-based systems with
#   sudo apt install python3-pdfminer
#   sudo apt-get install poppler-utils
#   sudo apt-get install perl
#   sudo apt-get install qpdf
#
# On Mac OS (no guarantee that it works)
#   python3-pdfminer (pdfminer.six via pip3)
#   poppler-utils (via Homebrew)
#
# Note that this script is updated on a regular basis, in particular to cover changes with
# the CEURART layout for papers. See ceur-ws.org/Vol-XXX/ for the CEURART specification.
#
# Call this script in the directory that contains the PDF files that you want to check.
#
# You can also use a parameter to specify which test shall be executed, e.g.
#   check-pdf-errors libertinus
#   check-pdf-errors "genai duplicates"
# A call without a parameter or with parameter "all" will execute all tests
#
# Manfred 2024-08-17 (2026-03-06)
#

# Define a function specifying which PDF files do NOT need to be tested
isExcludedFile() {
    # Decide whether a PDF is front/back matter rather than a paper.
    #   $1 - file name to classify
    # Returns 0 ("true") when the name matches one of the exclusion
    # patterns below, 1 ("false") otherwise.
    local candidate="$1"
    local pattern

    for pattern in \
        '*reface*.pdf' \
        '*rganization.pdf' \
        '*ponsors.pdf' \
        '*bstract*.pdf' \
        '*ommittee*.pdf' \
        '*matter*.pdf' \
        '*ditorial.pdf'
    do
        # The right-hand side is deliberately unquoted so that it is
        # treated as a glob pattern and not as a literal string.
        if [[ "$candidate" == $pattern ]]; then
            return 0
        fi
    done

    return 1
}


# Abort early when the current directory holds no PDF file at all.
# compgen -G expands the glob inside the shell and returns non-zero when
# nothing matches, so no "ls" output has to be parsed and no "ls" error
# message is printed for an empty directory.
if ! compgen -G "*.pdf" > /dev/null; then
    echo "No file with filetype *.pdf in this directory."
    exit 1
fi

echo "check-pdf-errors V0.98b (2026-03-06) CC-BY-SA 4.0"

# TOBETESTED holds the names of the tests to run as one string; each test
# section below checks whether its own name occurs in that string.
# No argument, or the argument "all", selects every test.
if [[ -z "${1:-}" || "$1" == "all" ]] ; then
   TOBETESTED="readable copyright genai logo libertinus duplicates leftover columns titles executable pagecount"
else
   TOBETESTED=$1
fi


# ===============
# Test "readable"
# ===============
if [[ "$TOBETESTED" == *"readable"* ]] ; then

# Reports every PDF whose first two pages contain neither "Abstract" nor
# "Copyright" as extractable text.
echo ""
echo "(*) Readable/Selectable text inside PDF files"
ABSTRACTWORD="found"
for f in *.pdf
do
  # Progress line; printf %s prints the file name verbatim, whereas echo -e
  # would interpret backslash sequences contained in a file name.
  printf '\rChecking %s               ' "$f"
  # PDF files should have the computer-readable string "Abstract" or "Copyright" on the first two pages (normally page 1)
  # Redirect order matters: stdout first, then stderr onto it, so that both are discarded.
  if ! pdf2txt -m 2 "$f" | grep -E 'Abstract|Copyright' > /dev/null 2>&1
  then
     # This test keeps its own exclusion list; unlike isExcludedFile it does
     # not exempt *bstract*.pdf and *ditorial.pdf.
     if [[ "$f" != *"reface"*".pdf" && "$f" != *"rganization.pdf" && "$f" != *"ponsors.pdf" && "$f" != *"ommittee"*".pdf" && "$f" != *"matter"*".pdf" ]]
     then
        ABSTRACTWORD="notfound"
        printf '\rPDF file %s seems to have no readable text included but only binary data\n' "$f"
     fi
  fi
done
if [[ "$ABSTRACTWORD" == "notfound" ]] ; then
  echo -e "\r ===> Make sure that paper PDFs have readable/selectable text in them; use a proper PDF printer driver on Windows such as PDFCreator or use LibreOffice PDF export"
  echo " "
else
  echo -e "\rok                                                                         "
  echo " "
fi

fi


# ================
# Test "copyright"
# ================
if [[ "$TOBETESTED" == *"copyright"* ]] ; then

# Reports every non-excluded PDF whose first two pages lack a Creative Commons
# license line, or whose license line mentions the year 2022 or 2023.
echo ""
echo "(*) CEUR-WS standard copyright phrase"
CREATIVECOMMONS="found"
for f in *.pdf
do
  # Progress line; printf %s prints the file name verbatim
  printf '\rChecking %s                  ' "$f"

  if isExcludedFile "$f" ; then
     continue
  fi

  # PDF files should have the string "Creative Commons License" on the first two pages; copyright year may not be 2022 or 2023
  # left unmodified by the author when using the CEURART template.
  # The second grep drops license lines containing 2022 or 2023; the pipeline
  # succeeds only if at least one license line remains.
  # Redirect order matters: stdout first, then stderr onto it, so that both are discarded.
  if ! pdf2txt -m 2 "$f" | grep -E 'Creative.*Commons.*License|Commons.*License.*Attribution' | grep -Ev '2022|2023' > /dev/null 2>&1
  then
     CREATIVECOMMONS="notfound"
     printf '\rPDF file %s seems to lack the proper copyright clause or copyright year on page 1\n' "$f"
  fi
done
if [[ "$CREATIVECOMMONS" == "notfound" ]] ; then
  echo -e "\r ===> Make sure that paper PDFs have the correct copyright clause, see https://ceur-ws.org/HOWTOSUBMIT.html#CCBY-FOOTNOTE"
  echo " ===> Make sure that paper PDFs have the correct *year* in the copyright clause!"
  echo " "
else
  echo -e "\rok                                                                        "
  echo " "
fi

fi


# ============
# Test "genai"
# ============
if [[ "$TOBETESTED" == *"genai"* ]] ; then

# Reports every non-excluded PDF that lacks a Declaration on Generative AI or
# that still names the placeholder tools of the template.
echo ""
echo "(*) Declaration on Generative AI"
GENAIDECL="ok"

# One scratch file holds the extracted text of the PDF currently being
# checked. It is created once and removed after the loop, so the "continue"
# statements below cannot leave temporary files behind.
# The template ends in the X run (no suffix after it) so that both GNU and
# BSD mktemp substitute it.
temp_file=$(mktemp /tmp/pdftext.XXXXXX) || { echo "Cannot create a temporary file in /tmp" >&2; exit 1; }

for f in *.pdf
do
  # Progress line; printf %s prints the file name verbatim
  printf '\rChecking %s             ' "$f"

  if isExcludedFile "$f" ; then
     continue
  fi

  # Text of all pages; overwrites the text of the previously checked file
  pdf2txt "$f" > "$temp_file"

  # Test 1: PDF files should have a section "Declaration on Generative AI".
  # A line must contain "declaration" (any case) together with
  # "Generative AI" or "GenAI".
  if ! grep -i -F 'declaration' "$temp_file" | grep -q -E '[Gg]enerative\s+AI|[Gg]enAI'
  then
     GENAIDECL="notok"
     printf '\rPDF file %s seems to lack a section Declaration on Generative AI\n' "$f"
     continue
  fi

  # Test 2: The phrase "The author ... not employed any Generative AI tools." is fine; no more checks needed
  if grep -q -E 'The author.*not employed any Generative AI tools\.' "$temp_file"
  then
    continue  # This file is ok
  fi

  # Test 3: Test on fake AI tool names from our template
  if grep -q -E 'X-GPT-.|Gramby|X-AI-IMG' "$temp_file"
  then
     GENAIDECL="notok"
     printf '\rPDF file %s seems to mention non existing tools X-GPT-.|Gramby|X-AI-IMG in its Declaration on Generative AI\n' "$f"
     continue
  fi

  # If Test2 failed then Test 4 is required: The GenAI statement must contain the phrase "reviewed|*full responsibility"
  # Not yet happy with this solution. We want authors to follow our template but some make slight variations. Others big ones.
  # cat "$temp_file" | tr '\n' ' ' | grep -E 'reviewed|full responsibility' 2>&1 > /dev/null
  #  if [[ $? != 0 ]];
  #then
  #   GENAIDECL="notok"
  #   echo "PDF file $f seems to use a non-standard text for the Declaration on Generative AI"
  #fi
done

/bin/rm -f "$temp_file"

if [[ "$GENAIDECL" == "notok" ]] ; then
  echo -e "\r ===> Make sure that paper PDFs have a Generative AI Declaration conforming https://ceur-ws.org/GenAI/Policy.html"
  echo " "
else
  echo -e "\rok                                                                   "
  echo " "
fi

fi


# ===========
# Test "logo"
# ===========
if [[ "$TOBETESTED" == *"logo"* ]] ; then

# Reports every non-excluded PDF whose first two pages already carry CEUR-WS
# branding. Files listed in watermark-log.txt (if that file exists) are skipped.
echo ""
echo "(*) Use of CEUR-WS.org logo or link or string CEUR Workshop Proceedings before publication"
CEURWSLINKFOUND="notfound"
for f in *.pdf
do
  # Progress line; printf %s prints the file name verbatim
  printf '\rChecking %s           ' "$f"

  # -F matches the file name as a fixed string (the dot in ".pdf" is not a
  # regex wildcard) and -- protects file names that start with a dash.
  if [[ -f watermark-log.txt ]] && grep -qF -- "$f" watermark-log.txt; then
    continue
  fi

  if isExcludedFile "$f" ; then
     continue
  fi

  # PDF files should not have the string "(CEUR-WS.org)" on the first two pages.
  # NOTE(review): in an extended regex the parentheses only group, so this also
  # matches "CEUR-WS.org" without surrounding parentheses - confirm that this
  # broader match is intended.
  # Redirect order matters: stdout first, then stderr onto it, so that both are discarded.
  if pdf2txt -m 2 "$f" | grep -E '(CEUR-WS.org)|CEUR Workshop Proceedings' > /dev/null 2>&1
  then
     CEURWSLINKFOUND="found"
     printf "\rPDF file %s uses 'CEUR-WS.org' or 'CEUR Workshop Proceedings' before publication\n" "$f"
  fi
done
if [[ "$CEURWSLINKFOUND" == "found" ]] ; then
  echo -e "\r ===> Use the latest CEURART template from https://ceur-ws.org/Vol-XXX/, which does not have a footnote with the CEUR-WS logo; Do not use the string 'CEUR Workshop Proceedings' in the header or footer of papers"
  echo " "
else
  echo -e "\rok                                                                      "
  echo " "
fi

fi


# =================
# Test "libertinus"
# =================
if [[ "$TOBETESTED" == *"libertinus"* ]] ; then

# Reports every non-excluded PDF that does not appear to use the Libertinus
# font family. Uses the optional helper $HOME/bin/check-libbyhead.py when it
# is installed, followed by a font-name search in the PDF itself.
echo ""
echo "(*) Libertinus font in paper PDFs"
NONLIBERTINUSFOUND="no"

if [[ ! -f "$HOME/bin/check-libbyhead.py" ]]; then
   echo " --> Extra test routine check-libbyhead.py missing. Result of this test is therefore incomplete."
fi

for f in *.pdf
do
  # Progress line; printf %s prints the file name verbatim
  printf '\rChecking %s                        ' "$f"

  if isExcludedFile "$f" ; then
     continue
  fi

  # 2025-12-15: We inspect the actual use of Libertinus fonts on page 1 for the headings
  # This is not a complete test. So, we may have to extend it to body text as well
  # The restriction to page 1 is for efficiency of the test

  if [[ -f "$HOME/bin/check-libbyhead.py" ]]; then
      # Paths and file names are quoted so that blanks in them do not split the arguments
      python3 "$HOME/bin/check-libbyhead.py" "$f"
      exit_code=$?
      if [[ $exit_code -eq 1 ]]; then
          NONLIBERTINUSFOUND="found"
          printf '\rPDF file %s seems not use Libertinus Sans font for headings\n' "$f"
      elif [[ $exit_code -eq 2 ]]; then
          NONLIBERTINUSFOUND="found"
          printf '\rPDF file %s seems not use Libertinus Serif font for body text\n' "$f"
      elif [[ $exit_code -eq 3 ]]; then
          NONLIBERTINUSFOUND="found"
          printf '\rPDF file %s seems not use Libertinus Serif font for body text and Libertinus Sans font for headings\n' "$f"
          # Only this case skips the font-name search below
          continue
      elif [[ $exit_code -ne 0 ]]; then
          printf '\rUnexpected error with PDF file %s\n' "$f"
      fi
  fi

  # Files created with Windows Word/LibreOffice apparently use CIDFont+F instead of Libertinus as font name (?)
  # We use two methods for Libertinus detection. First, 'strings | grep FontName' extracts it if it is used as a PDF
  # command. Second, pdffonts is used when the fonts are embedded differently.
  if ! strings "$f" | grep FontName | grep -q -E 'Libertinus|CIDFont.F5'
  then
       # Error messages of pdffonts are discarded; "$f" is quoted so that file names with blanks work
       if ! pdffonts "$f" 2> /dev/null | grep -q -E 'Libertinus|CIDFont.F5' ; then
          NONLIBERTINUSFOUND="found"
          printf '\rPDF file %s does not use Libertinus font family\n' "$f"
       fi
  fi
done
if [[ "$NONLIBERTINUSFOUND" == "found" ]] ; then
  echo -e "\r ===> Make sure that paper PDFs use the Libertinus font family; instructions to install Libertinus fonts are included in https://ceur-ws.org/Vol-XXX/CEUR-Template-1col.odt; paper PDFs should all use Libertinus fonts, prefaces that have no paper character do not necessarily need to use Libertinus; do not use Word/MS365 but rather use LibreOffice and its PDF exporter"
  echo " "
else
  echo -e "\rok                                                                                    "
  echo " "
fi

fi


# =================
# Test "duplicates"
# =================
if [[ "$TOBETESTED" == *"duplicates"* ]] ; then

# Lists non-empty PDF files (searched recursively from the current directory)
# that share the same MD5 checksum.
echo ""
echo "(*) Duplicate PDF files"
# md5sum prints a 32-character checksum first; after sorting, uniq compares
# only those 32 characters (-w32) and prints all members of every duplicate
# group (-dD). Relies on the GNU versions of these tools.
DUPPDFSFILES=$(find . -name '*.pdf' ! -empty -type f -exec md5sum {} + | sort | uniq -w32 -dD)
if [[ -n "$DUPPDFSFILES" ]]
then
  echo -e "\r "
  echo " ==========> ERROR (P2) with duplicate PDF files!!!!"
  # Print the list collected above instead of hashing every file a second time
  printf '%s\n' "$DUPPDFSFILES"
  echo " "
else
  echo -e "\rok                                                                                      "
  echo " "
fi

fi


# ===============
# Test "leftover"
# ===============
if [[ "$TOBETESTED" == *"leftover"* ]] ; then

# Reports every PDF whose first two pages still contain placeholder data of
# the CEURART template (dummy ORCIDs or the sample event name).
# All PDF files are checked here; no file is excluded.
echo ""
echo "(*) Leftover elements from CEURART in the footer of page 1"
NOLEFTOVER="yes"
for f in *.pdf
do
  # Progress line; printf %s prints the file name verbatim
  printf '\rChecking %s                ' "$f"
  # PDF files should not have leftover elements from the CEURART template in the footer
  # Redirect order matters: stdout first, then stderr onto it, so that both are discarded.
  if pdf2txt -m 2 "$f" | grep -E '0000-0000-0000-0000|Woodstock.*22|0000-0002-0877-7063.*0000-0001-7116-9338.*0000-0002-9421-8566' > /dev/null 2>&1
  then
     NOLEFTOVER="no"
     printf '\rPDF file %s seems to have some leftover elements from the footer of the CEURART template on page 1 such as ORCIDs or the event name\n' "$f"
  fi
done
if [[ "$NOLEFTOVER" == "no" ]] ; then
  echo -e "\r ===> Make sure that paper PDFs have correct data in the footnote section of page 1"
  echo " "
else
  echo -e "\rok                                                                   "
  echo " "
fi

fi


# ==============
# Test "columns"
# ==============
if [[ "$TOBETESTED" == *"columns"* ]] ; then

# Runs the optional helper $HOME/bin/check-columns on every non-excluded PDF.
# Without the helper only a reminder for a manual check is printed.
if [[ ! -x "$HOME/bin/check-columns" ]]; then
    echo " "
    echo "(*) Check manually if all paper PDFs are in CEURART one-column style"
    echo " "
else
  # A better test on one-column
  echo ""
  echo "(*) Check on one-column style"
  TWOCOLUMNFOUND="no"
  for f in *.pdf
  do

  if isExcludedFile "$f" ; then
     continue
  fi

  # Progress line; printf %s prints the file name verbatim
  printf '\rChecking %s           ' "$f"
  # The helper returns a non-zero exit code if a file is presumably in
  # two-column mode. Path and file name are quoted so that blanks in them do
  # not split the arguments.
  if ! "$HOME/bin/check-columns" "$f"; then
    TWOCOLUMNFOUND="yes"
    printf '\rPDF file %s seems to be in two column format\n' "$f"
  fi

  done

  if [[ "$TWOCOLUMNFOUND" == "yes" ]] ; then
    echo -e "\r ===> Make sure that paper PDFs are in one-column CEURART format, see https://ceur-ws.org/Vol-XXX/"
    echo " "
  else
    echo -e "\rok                                                                         "
    echo " "
  fi
fi

fi


# ================================================================
# Test "titles" and "authors" in index.html vs. the paper PDF file
# ================================================================
#
# 2026-02-15: pretty complete but does not find errors when the PDF file has more authors
# than the corresponding entry in index.html
#

if [[ "$TOBETESTED" == *"titles"* ]] ; then

# Compares every <li> entry of index.html that links to a PDF with the text
# of page 1 of that PDF: the CEURTITLE must occur on the page, and all
# CEURAUTHOR names must occur in the same order as in index.html.
echo ""
echo "(*) Check whether the paper titles and authors in index.html match the PDF file"

if [[ ! -r index.html ]]; then
  # Without this guard perl produces no findings at all and the test would
  # wrongly report "ok".
  echo " ===> File index.html not found (or not readable) in this directory; titles and authors were not checked"
  echo " "
else

# We use a generic MISMATCH variable to capture any issues in this block
MISMATCH_FOUND="no"

# Capture Perl output directly.
# -0777 makes perl read index.html in one piece; the program text is enclosed
# in single quotes, so it must not contain a single quote itself.
perl_output=$(perl -Mopen=locale -0777 -ne '
use strict;
use warnings;

my $any_mismatch = 0;

# Subroutine to aggressively normalize text (lowercase, no punctuation, no spaces)
sub normalize_text {
    my ($text) = @_;
    return "" unless defined($text);
    $text =~ s/[\n\t\r]+/ /g; 
    $text =~ s/\s+/ /g;       
    $text = lc($text);
    $text =~ s/[\*\x{22c6}]//g; # Remove standard * and Unicode ⋆
    $text =~ s/[^a-z0-9]/ /g;  
    $text =~ s/\s+//g; # Remove ALL spaces for a condensed comparison
    return $text;
}

while (/<li[^>]*>(.*?)<\/li>/sg) {
    my $li = $1;
    my $pdf;
    my $title;
    my @authors;

    # Extract PDF filename
    if ($li =~ /<a href="([^"]+\.pdf)"/s) {
        $pdf = $1;
        $pdf =~ s/.*\///; # Basename only

        # Extract Title from index.html
        if ($li =~ /<span class="CEURTITLE">(.*?)<\/span>/s) {
            $title = $1;
        } elsif ($li =~ /<a [^>]*class="CEURTITLE">(.*?)<\/a>/s) {
            $title = $1;
        }

        # Extract Authors from index.html
        while ($li =~ /<span class="CEURAUTHOR">(.*?)<\/span>/sg) {
            my $auth = $1;
            $auth =~ s/\s+/ /g;
            $auth =~ s/ \*$//; # Remove trailing asterisk if not caught by normalization
            push @authors, $auth;
        }

        # Check if PDF file exists locally
        if (!-f $pdf) {
            print "Error: File $pdf missing\n";
            $any_mismatch = 1;
            next;
        }

        # PDF text extraction (first page only).
        # pdftotext is started with a list-form pipe open, so no shell ever
        # parses the file name taken from index.html. The option -q keeps
        # pdftotext silent.
        my $page1 = "";
        if (open(my $pdftext, "-|", "pdftotext", "-q", "-f", "1", "-l", "1", $pdf, "-")) {
            $page1 = <$pdftext>; # Reads all output at once because of -0777
            close($pdftext);
        }
        my $normpage = normalize_text($page1);

        # --- 1. TITLE CHECK ---
        if ($title) {
            my $normtitle = normalize_text($title);
            my $escaped_t = quotemeta($normtitle);
            if ($normpage !~ /$escaped_t/) {
                print "$pdf: Expected CEURTITLE \"$title\" not found on first page\n";
                $any_mismatch = 1;
            }
        }

        # --- 2. AUTHOR CHECK (Presence and Sequence) ---
        my %author_positions;
        my $all_authors_present = 1;

        foreach my $author (@authors) {
            my $norm_auth = normalize_text($author);
            my $escaped_a = quotemeta($norm_auth);

            # Find absolute position on page
            if ($normpage =~ /($escaped_a)/) {
                # $-[0] is the start index of the match in the string
                $author_positions{$author} = $-[0];
            } else {
                print "$pdf: Expected CEURAUTHOR \"$author\" not found on first page\n";
                $all_authors_present = 0;
                $any_mismatch = 1;
            }
        }

        # Only check order if everyone was actually found
        if ($all_authors_present && scalar @authors > 1) {
            for (my $i = 0; $i < $#authors; $i++) {
                my $curr = $authors[$i];
                my $next = $authors[$i+1];

                if ($author_positions{$curr} > $author_positions{$next}) {
                    print "$pdf: Author sequence error. \"$curr\" must appear before \"$next\".\n";
                    $any_mismatch = 1;
                }
            }
        }
    }
}

# Final flag for bash
if ($any_mismatch) {
    print "__MISMATCH__=yes\n";
} else {
    print "__MISMATCH__=no\n";
}
' index.html)

# Parse the Perl output to set the Bash variable
while IFS= read -r line; do
    if [[ "$line" == __MISMATCH__=* ]]; then
        MISMATCH_FOUND="${line#__MISMATCH__=}"
    else
        # Echo the specific error findings (e.g. "missing author")
        echo "$line"
    fi
done <<< "$perl_output"


if [[ "$MISMATCH_FOUND" == "yes" ]] ; then
  echo " ===> Make sure that paper CEURTITLE and CEURAUTHORs in index.html match the PDF!"
  echo "      Check both the text content and the order of authors. Accents do matter!"
  echo " "
else
  echo "ok"
  echo " "
fi

fi

fi


# =================
# Test "executable"
# =================
if [[ "$TOBETESTED" == *"executable"* ]]; then

  # Heuristic scan only: obfuscated JavaScript inside a PDF is not detected.
  echo ""
  echo "(*) Searching for executable code (JS/Launch/OpenAction) in ALL PDF files"
  EXEC_FOUND="no"

  for f in *.pdf; do
    echo -en "\rChecking $f                        "

    # Pipeline stages:
    #   strings        - raw printable sequences of the PDF
    #   grep -Ei       - JavaScript, Launch, or OpenAction followed by /S
    #   grep -vE (2x)  - drop lines that mention font-related keywords
    #   head           - keep at most two hits
    # OpenAction entries not followed by /S (e.g. the /XYZ "Initial View" of
    # the templates) are not matched by the first grep.
    MATCH=$(
      strings "$f" \
        | grep -Ei "/S\s*/JavaScript|/JS\s+|/Launch|/OpenAction\s*/S" \
        | grep -vE "Font|Descriptor|Encoding|Widths|ProcSet|Subset" \
        | grep -vE "/JS-|Libertinus|CIDFont" \
        | head -n 2
    )

    # Nothing suspicious found in this file
    if [[ -z "$MATCH" ]]; then
      continue
    fi

    EXEC_FOUND="yes"
    echo -e "\rPDF file $f contains active content near:"
    # Show the hits, each prefixed with a dash
    echo "$MATCH" | head -n 2 | sed 's/^/  - /'
  done

  case "$EXEC_FOUND" in
    yes)
      echo ""
      echo " ===> WARNING: One or more PDFs contain active content (JavaScript or Launch commands)."
      echo "      CEUR-WS papers should generally be static documents without embedded logic."
      echo " "
      ;;
    *)
      echo -e "\rok                                                                   "
      echo " "
      ;;
  esac

fi


# =================
# Test "pagecount"
# =================
if [[ "$TOBETESTED" == *"pagecount"* ]]; then

  # Warns about non-excluded PDFs for which pdfinfo reports fewer than 5 pages.
  echo ""
  echo "(*) Checking for PDFs with less than 5 pages"
  SHORT_PAPERS="no"

  for f in *.pdf; do
    isExcludedFile "$f" && continue

    echo -en "\rChecking $f                        "

    # Second field of the "Pages:" line printed by pdfinfo; stays empty when
    # pdfinfo prints no such line
    PAGES=$(pdfinfo "$f" 2>/dev/null | awk '/^Pages:/ {print $2}')

    # No page count available: nothing to compare
    if [[ -z "$PAGES" ]]; then
      continue
    fi

    if [[ "$PAGES" -lt 5 ]]; then
      SHORT_PAPERS="yes"
      echo -e "\rPDF file $f has only $PAGES pages."
    fi
  done

  case "$SHORT_PAPERS" in
    yes)
      echo -e "\r                                                                 "
      echo " ===> WARNING: One or more papers have less than 5 pages."
      echo "      CEUR-WS papers typically require a minimum length; please verify."
      echo " "
      ;;
    *)
      echo -e "\rok                                                                   "
      echo " "
      ;;
  esac

fi