From 205b99dfa0f138d81a6b4aa4d83918e32073acfb Mon Sep 17 00:00:00 2001
From: Ida Dittrich <i.dittrich@valueon.ch>
Date: Mon, 23 Feb 2026 11:20:51 +0100
Subject: [PATCH] Extract PDF text with PyMuPDF, fall back to PyPDF2

---
 .../extractors/extractorPdf.py                | 36 ++++++++++++++++---
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/modules/services/serviceExtraction/extractors/extractorPdf.py b/modules/services/serviceExtraction/extractors/extractorPdf.py
index 8a670e57..244aef90 100644
--- a/modules/services/serviceExtraction/extractors/extractorPdf.py
+++ b/modules/services/serviceExtraction/extractors/extractorPdf.py
@@ -73,13 +73,14 @@ class PdfExtractor(Extractor):
             ))
             return parts
 
-        # Extract text per page with PyPDF2
+        # Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
         try:
             with io.BytesIO(fileBytes) as buf:
-                reader = PyPDF2.PdfReader(buf)
-                for i, page in enumerate(reader.pages):
+                doc = fitz.open(stream=buf.getvalue(), filetype="pdf")
+                for i in range(len(doc)):
                     try:
-                        text = page.extract_text() or ""
+                        page = doc[i]
+                        text = page.get_text() or ""
                         if text.strip():
                             parts.append(ContentPart(
                                 id=makeId(),
@@ -92,13 +93,38 @@ class PdfExtractor(Extractor):
                             ))
                     except Exception:
                         continue
+                doc.close()
         except Exception:
             pass
 
+        # Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing
+        has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts)
+        if not has_text:
+            try:
+                with io.BytesIO(fileBytes) as buf:
+                    reader = PyPDF2.PdfReader(buf)
+                    for i, page in enumerate(reader.pages):
+                        try:
+                            text = page.extract_text() or ""
+                            if text.strip():
+                                parts.append(ContentPart(
+                                    id=makeId(),
+                                    parentId=rootId,
+                                    label=f"page_{i+1}",
+                                    typeGroup="text",
+                                    mimeType="text/plain",
+                                    data=text,
+                                    metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
+                                ))
+                        except Exception:
+                            continue
+            except Exception:
+                pass
+
         # Extract images with PyMuPDF
         try:
             with io.BytesIO(fileBytes) as buf2:
-                doc = fitz.open(stream=buf2, filetype="pdf")
+                doc = fitz.open(stream=buf2.getvalue(), filetype="pdf")
                 for i in range(len(doc)):
                     page = doc[i]
                     images = page.get_images(full=True)