#!/bin/bash
#
# File: check-pdf-errors
# Version: 0.98b
#
# Checks whether the PDF files in the current directory have issues in matching the CEURART style.
# (C) 2024-2025 by Manfred Jeusfeld. This script is made available under the
# Creative Commons Attribution-ShareAlike CC-BY-SA 4.0 license.
#
# The BASH script is part of the scripts used for CEUR-WS.org. No warranty whatsoever. No support.
#
# Requires the installation of certain packages, in particular pdf2txt, pdffonts, and perl.
# The tool pdf2txt is part of the package python-pdfminer. The pdffonts command is part of
# the package poppler-utils.
# Install on Debian-based systems with
#    sudo apt install python3-pdfminer
#    sudo apt-get install poppler-utils
#    sudo apt-get install perl
#    sudo apt-get install qpdf
#
# On Mac OS (no guarantee that it works)
#    python3-pdfminer (pdfminer.six via pip3)
#    poppler-utils (via Homebrew)
#
# Note that this script is updated on a regular basis, in particular to cover changes with
# the CEURART layout for papers. See ceur-ws.org/Vol-XXX/ for the CEURART specification.
#
# Call this script in the directory that contains the PDF files that you want to check.
#
# You can also use a parameter to specify which test shall be executed, e.g.
#    check-pdf-errors libertinus
#    check-pdf-errors "genai duplicates"
# A call without a parameter or with parameter "all" will execute all tests
#
# Manfred 2024-08-17 (2026-03-06)
#

# Define a function specifying which PDF files do NOT need to be tested.
# Arguments: $1 - file name of a PDF file
# Returns:   0 if the file is front matter (preface, organization, sponsors, ...)
#            and shall be skipped, 1 otherwise
isExcludedFile() {
  local filename="$1"
  case "$filename" in
    *"reface"*".pdf" | *"rganization.pdf" | *"ponsors.pdf" | *"bstract"*".pdf" | *"ommittee"*".pdf" | *"matter"*".pdf" | *"ditorial.pdf")
      return 0  # "True" in bash exit codes
      ;;
    *)
      return 1  # "False"
      ;;
  esac
}

# compgen -G tests the glob without parsing the output of ls and without
# printing an ls error message when nothing matches.
if ! compgen -G '*.pdf' > /dev/null; then
  echo "No file with filetype *.pdf in this directory."
  exit 1
fi

echo "check-pdf-errors V0.98b (2026-03-06) CC-BY-SA 4.0"

if [[ "$1" == "" || "$1" == "all" ]]; then
  TOBETESTED="readable copyright genai logo libertinus duplicates leftover columns titles executable"
else
  TOBETESTED="$1"
fi


# ===============
# Test "readable"
# ===============

if [[ "$TOBETESTED" == *"readable"* ]]; then
  echo ""
  echo "(*) Readable/Selectable text inside PDF files"
  ABSTRACTWORD="found"
  for f in *.pdf; do
    echo -en "\rChecking $f "
    # PDF files should have the computer-readable string "Abstract" or "Copyright" on the first two pages (normally page 1)
    if ! pdf2txt -m 2 "$f" | grep -E 'Abstract|Copyright' > /dev/null 2>&1; then
      # This test keeps its own exclusion list; it is narrower than isExcludedFile
      if [[ "$f" != *"reface"*".pdf" && "$f" != *"rganization.pdf" && "$f" != *"ponsors.pdf" && "$f" != *"ommittee"*".pdf" && "$f" != *"matter"*".pdf" ]]; then
        ABSTRACTWORD="notfound"
        echo -e "\rPDF file $f seems to have no readable text included but only binary data"
      fi
    fi
  done
  if [[ "$ABSTRACTWORD" == "notfound" ]]; then
    echo -e "\r ===> Make sure that paper PDFs have readable/selectable text in them; use a proper PDF printer driver on Windows such as PDFCreator or use LibreOffice PDF export"
    echo " "
  else
    echo -e "\rok "
    echo " "
  fi
fi


# ================
# Test "copyright"
# ================

if [[ "$TOBETESTED" == *"copyright"* ]]; then
  echo ""
  echo "(*) CEUR-WS standard copyright phrase"
  CREATIVECOMMONS="found"
  for f in *.pdf; do
    echo -en "\rChecking $f "
    if isExcludedFile "$f"; then
      continue
    fi
    # PDF files should have the string "Creative Commons License" on the first two pages; copyright year may not be 2022 or 2023
    # left unmodified by the author when using the CEURART template
    if ! pdf2txt -m 2 "$f" \
      | grep -E 'Creative.*Commons.*License|Commons.*License.*Attribution' \
      | grep -Ev '2022|2023' > /dev/null 2>&1; then
      CREATIVECOMMONS="notfound"
      echo -e "\rPDF file $f seems to lack the proper copyright clause or copyright year on page 1"
    fi
  done
  if [[ "$CREATIVECOMMONS" == "notfound" ]]; then
    echo -e "\r ===> Make sure that paper PDFs have the correct copyright clause, see https://ceur-ws.org/HOWTOSUBMIT.html#CCBY-FOOTNOTE"
    echo " ===> Make sure that paper PDFs have the correct *year* in the copyright clause!"
    echo " "
  else
    echo -e "\rok "
    echo " "
  fi
fi


# ============
# Test "genai"
# ============

if [[ "$TOBETESTED" == *"genai"* ]]; then
  echo ""
  echo "(*) Declaration on Generative AI"
  GENAIDECL="ok"

  # Make sure the temporary text file is removed even if the script is interrupted
  temp_file=""
  cleanup_temp_file() {
    if [[ -n "$temp_file" ]]; then
      /bin/rm -f -- "$temp_file"
    fi
  }
  trap cleanup_temp_file EXIT

  for f in *.pdf; do
    echo -en "\rChecking $f "
    if isExcludedFile "$f"; then
      continue
    fi

    # The X's are at the end of the template so that BSD mktemp (Mac OS) works as well
    if ! temp_file=$(mktemp "${TMPDIR:-/tmp}/pdftext.XXXXXX"); then
      temp_file=""
      GENAIDECL="notok"
      echo -e "\rCould not create a temporary file for checking $f" >&2
      continue
    fi
    # pdftotext "$f" "$temp_file"
    pdf2txt "$f" > "$temp_file"

    # The checks are chained with elif so that the temporary file is removed on every path.
    if ! grep -i -F 'declaration' "$temp_file" | grep -q -E '[Gg]enerative[[:space:]]+AI|[Gg]enAI'; then
      # Test 1: PDF files should have a section "Declaration on Generative AI"
      GENAIDECL="notok"
      echo -e "\rPDF file $f seems to lack a section Declaration on Generative AI"
    elif grep -q -E 'The author.*not employed any Generative AI tools\.' "$temp_file"; then
      # Test 2: The phrase "The author ... not employed any Generative AI tools." is fine; no more checks needed
      :  # This file is ok
    elif grep -q -E 'X-GPT-.|Gramby|X-AI-IMG' "$temp_file"; then
      # Test 3: Test on fake AI tool names from our template
      GENAIDECL="notok"
      echo -e "\rPDF file $f seems to mention non existing tools X-GPT-.|Gramby|X-AI-IMG in its Declaration on Generative AI"
    fi

    # If Test 2 failed then Test 4 is required: The GenAI statement must contain the phrase "reviewed|*full responsibility"
    # Not yet happy with this solution. We want authors to follow our template but some make slight variations. Others big ones.
    # cat "$temp_file" | tr '\n' ' ' | grep -E 'reviewed|full responsibility' 2>&1 > /dev/null
    # if [[ $? != 0 ]];
    # then
    #   GENAIDECL="notok"
    #   echo "PDF file $f seems to use a non-standard text for the Declaration on Generative AI"
    # fi

    /bin/rm -f -- "$temp_file"
    temp_file=""
  done
  if [[ "$GENAIDECL" == "notok" ]]; then
    echo -e "\r ===> Make sure that paper PDFs have a Generative AI Declaration conforming https://ceur-ws.org/GenAI/Policy.html"
    echo " "
  else
    echo -e "\rok "
    echo " "
  fi
fi


# ===========
# Test "logo"
# ===========

if [[ "$TOBETESTED" == *"logo"* ]]; then
  echo ""
  echo "(*) Use of CEUR-WS.org logo or link or string CEUR Workshop Proceedings before publication"
  CEURWSLINKFOUND="notfound"
  for f in *.pdf; do
    echo -en "\rChecking $f "
    # Files listed in watermark-log.txt are skipped; -F matches the file name literally
    if [[ -f watermark-log.txt ]] && grep -q -F -- "$f" watermark-log.txt; then
      continue
    fi
    if isExcludedFile "$f"; then
      continue
    fi
    # PDF files should not have the string "CEUR-WS.org" or "CEUR Workshop Proceedings" on the first two pages.
    # Note: in an extended regular expression the parentheses only group, so "CEUR-WS.org" is
    # matched with or without surrounding brackets.
    if pdf2txt -m 2 "$f" | grep -E '(CEUR-WS.org)|CEUR Workshop Proceedings' > /dev/null 2>&1; then
      CEURWSLINKFOUND="found"
      echo -e "\rPDF file $f uses 'CEUR-WS.org' or 'CEUR Workshop Proceedings' before publication"
    fi
  done
  if [[ "$CEURWSLINKFOUND" == "found" ]]; then
    echo -e "\r ===> Use the latest CEURART template from https://ceur-ws.org/Vol-XXX/, which does not have a footnote with the CEUR-WS logo; Do not use the string 'CEUR Workshop Proceedings' in the header or footer of papers"
    echo " "
  else
    echo -e "\rok "
    echo " "
  fi
fi


# =================
# Test "libertinus"
# =================

if [[ "$TOBETESTED" == *"libertinus"* ]]; then
  echo ""
  echo "(*) Libertinus font in paper PDFs"
  NONLIBERTINUSFOUND="no"
  LIBBYHEAD="$HOME/bin/check-libbyhead.py"
  if [[ ! -f "$LIBBYHEAD" ]]; then
    echo " --> Extra test routine check-libbyhead.py missing. Result of this test is therefore incomplete."
  fi
  for f in *.pdf; do
    echo -en "\rChecking $f "
    if isExcludedFile "$f"; then
      continue
    fi

    # 2025-12-15: We inspect the actual use of Libertinus fonts on page 1 for the headings
    # This is not a complete test. So, we may have to extend it to body text as well
    # The restriction to page 1 is for efficiency of the test
    if [[ -f "$LIBBYHEAD" ]]; then
      python3 "$LIBBYHEAD" "$f"
      exit_code=$?
      case "$exit_code" in
        0)
          ;;
        1)
          NONLIBERTINUSFOUND="found"
          echo -e "\rPDF file $f seems not use Libertinus Sans font for headings"
          ;;
        2)
          NONLIBERTINUSFOUND="found"
          echo -e "\rPDF file $f seems not use Libertinus Serif font for body text"
          ;;
        3)
          NONLIBERTINUSFOUND="found"
          echo -e "\rPDF file $f seems not use Libertinus Serif font for body text and Libertinus Sans font for headings"
          continue
          ;;
        *)
          echo -e "\rUnexpected error with PDF file $f"
          ;;
      esac
    fi

    # Files created with Windows Word/LibreOffice apparently use CIDFont+F instead Libertinus as font name (?)
    # We use two methods for Libertinus detection. First, 'strings | grep FontName' extracts it if it is used as a PDF
    # command. Second, pdffonts is used when the fonts are embedded differently.
    if ! strings "$f" | grep FontName | grep -q -E 'Libertinus|CIDFont.F5'; then
      if ! pdffonts "$f" 2> /dev/null | grep -q -E 'Libertinus|CIDFont.F5'; then
        NONLIBERTINUSFOUND="found"
        echo -e "\rPDF file $f does not use Libertinus font family"
      fi
    fi
  done
  if [[ "$NONLIBERTINUSFOUND" == "found" ]]; then
    echo -e "\r ===> Make sure that paper PDFs use the Libertinus font family; instructions to install Libertinus fonts are included in https://ceur-ws.org/Vol-XXX/CEUR-Template-1col.odt; paper PDFs should all use Libertinus fonts, prefaces that have no paper character do not necessarily need to use Libertinus; do not use Word/MS365 but rather use LibreOffice and its PDF exporter"
    echo " "
  else
    echo -e "\rok "
    echo " "
  fi
fi


# =================
# Test "duplicates"
# =================

if [[ "$TOBETESTED" == *"duplicates"* ]]; then
  echo ""
  echo "(*) Duplicate PDF files"
  # Files with identical MD5 checksum (first 32 characters of each md5sum line) are duplicates
  DUPPDFSFILES=$(find . -name '*.pdf' ! -empty -type f -exec md5sum {} + | sort | uniq -w32 -dD)
  if [[ -n "$DUPPDFSFILES" ]]; then
    echo -e "\r "; echo " ==========> ERROR (P2) with duplicate PDF files!!!!"
    # Print the list computed above instead of running the pipeline a second time
    printf '%s\n' "$DUPPDFSFILES"
    echo " "
  else
    echo -e "\rok "
    echo " "
  fi
fi


# ===============
# Test "leftover"
# ===============

if [[ "$TOBETESTED" == *"leftover"* ]]; then
  echo ""
  echo "(*) Leftover elements from CEURART in the footer of page 1"
  NOLEFTOVER="yes"
  for f in *.pdf; do
    echo -en "\rChecking $f "
    # PDF files should not have leftover elements from the CEURART template in the footer
    if pdf2txt -m 2 "$f" | grep -E '0000-0000-0000-0000|Woodstock.*22|0000-0002-0877-7063.*0000-0001-7116-9338.*0000-0002-9421-8566' > /dev/null 2>&1; then
      NOLEFTOVER="no"
      echo -e "\rPDF file $f seems to have some leftover elements from the footer of the CEURART template on page 1 such as ORCIDs or the event name"
    fi
  done
  if [[ "$NOLEFTOVER" == "no" ]]; then
    echo -e "\r ===> Make sure that paper PDFs have correct data in the footnote section of page 1"
    echo " "
  else
    echo -e "\rok "
    echo " "
  fi
fi


# ==============
# Test "columns"
# ==============

if [[ "$TOBETESTED" == *"columns"* ]]; then
  if [[ ! -x "$HOME/bin/check-columns" ]]; then
    echo " "
    echo "(*) Check manually if all paper PDFs are in CEURART one-column style"
    echo " "
  else
    # A better test on one-column
    echo ""
    echo "(*) Check on one-column style"
    TWOCOLUMNFOUND="no"
    for f in *.pdf; do
      if isExcludedFile "$f"; then
        continue
      fi
      echo -en "\rChecking $f "
      # check-columns returns error code 1 if a file is presumably in two-column mode
      if ! "$HOME/bin/check-columns" "$f"; then
        TWOCOLUMNFOUND="yes"
        echo -e "\rPDF file $f seems to be in two column format"
      fi
    done
    if [[ "$TWOCOLUMNFOUND" == "yes" ]]; then
      echo -e "\r ===> Make sure that paper PDFs are in one-column CEURART format, see https://ceur-ws.org/Vol-XXX/"
      echo " "
    else
      echo -e "\rok "
      echo " "
    fi
  fi
fi


# ================================================================
# Test "titles" and "authors" in index.html vs. the paper PDF file
# ================================================================
#
# 2026-02-15: pretty complete but does not find errors when the PDF file has more authors
# than the corresponding entry in index.html
#
# NOTE(review): the HTML patterns in the Perl code below (li, a href, CEURTITLE, CEURAUTHOR)
# assume the standard CEUR-WS index.html markup - confirm against a real index.html.
#

if [[ "$TOBETESTED" == *"titles"* ]]; then
  echo ""
  echo "(*) Check whether the paper titles and authors in index.html match the PDF file"

  if [[ ! -f index.html ]]; then
    # Without index.html there is nothing to compare; do not report a misleading "ok"
    echo " --> File index.html missing in this directory. This test is skipped."
    echo " "
  else
    # We use a generic MISMATCH variable to capture any issues in this block
    MISMATCH_FOUND="no"

    # Capture Perl output directly.
    # The Perl program is enclosed in single quotes: it must not contain any single quote.
    perl_output=$(perl -Mopen=locale -0777 -ne '
      use strict;
      use warnings;

      my $any_mismatch = 0;

      # Subroutine to aggressively normalize text (lowercase, no punctuation, no spaces)
      sub normalize_text {
          my ($text) = @_;
          return "" unless defined($text);
          $text =~ s/[\n\t\r]+/ /g;
          $text =~ s/\s+/ /g;
          $text = lc($text);
          $text =~ s/[\*\x{22c6}]//g;   # Remove standard * and Unicode star operator
          $text =~ s/[^a-z0-9]/ /g;
          $text =~ s/\s+//g;            # Remove ALL spaces for a condensed comparison
          return $text;
      }

      # One list item per paper entry
      while (/<li[^>]*>(.*?)<\/li>/sg) {
          my $li = $1;
          my $pdf;
          my $title;
          my @authors;

          # Extract PDF filename; list items without a link to a PDF file are no paper entries
          if ($li =~ /<a[^>]*href="([^"]+\.pdf)"/i) {
              $pdf = $1;
          } else {
              next;
          }

          # Extract title
          if ($li =~ /<span class="CEURTITLE">(.*?)<\/span>/s) {
              $title = $1;
          } elsif ($li =~ /<a[^>]*class="CEURTITLE">(.*?)<\/a>/s) {
              $title = $1;
          }

          # Extract Authors from index.html
          while ($li =~ /<span class="CEURAUTHOR">(.*?)<\/span>/sg) {
              my $auth = $1;
              $auth =~ s/\s+/ /g;
              $auth =~ s/ \*$//;   # Remove trailing asterisk if not caught by normalization
              push @authors, $auth;
          }

          # Check if PDF file exists locally
          if (!-f $pdf) {
              print "Error: File $pdf missing\n";
              $any_mismatch = 1;
              next;
          }

          # PDF text extraction (first page only).
          # The list form of open runs pdftotext without a shell, so special characters
          # in the file name cannot be interpreted as shell code.
          my $page1 = "";
          if (open(my $fh, "-|", "pdftotext", "-q", "-f", "1", "-l", "1", $pdf, "-")) {
              my $out = <$fh>;   # reads everything at once because of option -0777
              close($fh);
              $page1 = $out if defined($out);
          }
          my $normpage = normalize_text($page1);

          # --- 1. TITLE CHECK ---
          if ($title) {
              my $normtitle = normalize_text($title);
              my $escaped_t = quotemeta($normtitle);
              if ($normpage !~ /$escaped_t/) {
                  print "$pdf: Expected CEURTITLE \"$title\" not found on first page\n";
                  $any_mismatch = 1;
              }
          }

          # --- 2. AUTHOR CHECK (Presence and Sequence) ---
          my %author_positions;
          my $all_authors_present = 1;

          foreach my $author (@authors) {
              my $norm_auth = normalize_text($author);
              my $escaped_a = quotemeta($norm_auth);

              # Find absolute position on page
              if ($normpage =~ /($escaped_a)/) {
                  # $-[0] is the start index of the match in the string
                  $author_positions{$author} = $-[0];
              } else {
                  print "$pdf: Expected CEURAUTHOR \"$author\" not found on first page\n";
                  $all_authors_present = 0;
                  $any_mismatch = 1;
              }
          }

          # Only check order if everyone was actually found
          if ($all_authors_present && scalar @authors > 1) {
              for (my $i = 0; $i < $#authors; $i++) {
                  my $curr = $authors[$i];
                  my $next = $authors[$i+1];
                  if ($author_positions{$curr} > $author_positions{$next}) {
                      print "$pdf: Author sequence error. \"$curr\" must appear before \"$next\".\n";
                      $any_mismatch = 1;
                  }
              }
          }
      }

      # Final flag for bash
      if ($any_mismatch) {
          print "__MISMATCH__=yes\n";
      } else {
          print "__MISMATCH__=no\n";
      }
    ' index.html)

    # Parse the Perl output to set the Bash variable
    while IFS= read -r line; do
      if [[ "$line" == __MISMATCH__=* ]]; then
        MISMATCH_FOUND="${line#__MISMATCH__=}"
      else
        # Echo the specific error findings (e.g. "missing author")
        echo "$line"
      fi
    done <<< "$perl_output"

    if [[ "$MISMATCH_FOUND" == "yes" ]]; then
      echo " ===> Make sure that paper CEURTITLE and CEURAUTHORs in index.html match the PDF!"
      echo " Check both the text content and the order of authors."
      echo " "
    else
      echo "ok"
      echo " "
    fi
  fi
fi


# =================
# Test "executable"
# =================

if [[ "$TOBETESTED" == *"executable"* ]]; then
  # This test is quite shaky and can easily be defeated by malicious attackers by obfuscating the JS code
  echo ""
  echo "(*) Searching for executable code (JS/Launch/OpenAction) in ALL PDF files"
  EXEC_FOUND="no"
  for f in *.pdf; do
    echo -en "\rChecking $f "

    # 1. Use strings to find the raw PDF tags.
    # 2. Search for JavaScript, Launch, or OpenActions that lead to a specific Service (/S).
    # 3. This excludes the "Initial View" (/XYZ) actions found for example in our templates.
    # At most the first 2 matching lines are kept for display.
    MATCH=$(strings "$f" \
      | grep -Ei '/S[[:space:]]*/JavaScript|/JS[[:space:]]+|/Launch|/OpenAction[[:space:]]*/S' \
      | grep -vE 'Font|Descriptor|Encoding|Widths|ProcSet|Subset' \
      | grep -vE '/JS-|Libertinus|CIDFont' \
      | head -n 2)

    if [[ -n "$MATCH" ]]; then
      EXEC_FOUND="yes"
      echo -e "\rPDF file $f contains active content near:"
      sed 's/^/ - /' <<< "$MATCH"
    fi
  done

  if [[ "$EXEC_FOUND" == "yes" ]]; then
    echo ""
    echo " ===> WARNING: One or more PDFs contain active content (JavaScript or Launch commands)."
    echo " CEUR-WS papers should generally be static documents without embedded logic."
    echo " "
  else
    echo -e "\rok "
    echo " "
  fi
fi