Add shell script to rename PDFs by extracting customer numbers

- rename_pdfs.sh: Main script that renames PDFs based on their content
- extract_customer.py: Helper script that extracts the customer number from a PDF
2026-03-23 14:38:56 +01:00
parent 1cc34b1a5c
commit b27c24f806
2 changed files with 59 additions and 0 deletions
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+import sys
+import re
+from pypdf import PdfReader
+
+def extract_customer(pdf_path):
+    try:
+        reader = PdfReader(pdf_path)
+        for page in reader.pages:
+            text = page.extract_text()
+            if text:
+                if 'Fakturaöversikt' in text:
+                    return 'index'
+                match = re.search(r'Kund\s+(\d+)', text)
+                if match:
+                    return f'customer_{match.group(1)}'
+    except Exception as e:
+        pass
+    return None
+
+if __name__ == '__main__':
+    if len(sys.argv) > 1:
+        result = extract_customer(sys.argv[1])
+        if result:
+            print(result)
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Script to rename PDF files based on customer number extracted from their content
+# Usage: ./rename_pdfs.sh [directory]
+# Default directory: output
+
+OUTPUT_DIR="${1:-output}"
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+FULL_DIR="$SCRIPT_DIR/$OUTPUT_DIR"
+
+if [ ! -d "$FULL_DIR" ]; then
+    echo "Error: Directory '$FULL_DIR' not found"
+    exit 1
+fi
+
+for pdf in "$FULL_DIR"/*.pdf; do
+    if [ -f "$pdf" ]; then
+        filename=$(basename "$pdf")
+        
+        # Extract text from PDF using Python and pypdf
+        result=$(python3 "$SCRIPT_DIR/extract_customer.py" "$pdf" 2>/dev/null)
+
+        if [ -n "$result" ]; then
+            new_name="${result}.pdf"
+            if [ "$filename" != "$new_name" ]; then
+                mv "$pdf" "$FULL_DIR/$new_name"
+                echo "Renamed: $filename -> $new_name"
+            fi
+        else
+            echo "Could not extract customer number from: $filename"
+        fi
+    fi
+done