Update PDF extractor: extract text with PyMuPDF, fall back to PyPDF2

2026-02-23 11:20:51 +01:00 · 2026-02-23 11:20:51 +01:00 · 205b99dfa0
commit 205b99dfa0
parent 5120fbc503
1 changed file with 31 additions and 5 deletions
--- a/modules/services/serviceExtraction/extractors/extractorPdf.py
+++ b/modules/services/serviceExtraction/extractors/extractorPdf.py
@@ -73,13 +73,14 @@ class PdfExtractor(Extractor):
            ))
            return parts

-        # Extract text per page with PyPDF2
+        # Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
        try:
            with io.BytesIO(fileBytes) as buf:
-                reader = PyPDF2.PdfReader(buf)
-                for i, page in enumerate(reader.pages):
+                doc = fitz.open(stream=buf.getvalue(), filetype="pdf")
+                for i in range(len(doc)):
                    try:
-                        text = page.extract_text() or ""
+                        page = doc[i]
+                        text = page.get_text() or ""
                        if text.strip():
                            parts.append(ContentPart(
                                id=makeId(),
@@ -92,13 +93,38 @@ class PdfExtractor(Extractor):
                            ))
                    except Exception:
                        continue
+                doc.close()
        except Exception:
            pass

+        # Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing
+        has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts)
+        if not has_text:
+            try:
+                with io.BytesIO(fileBytes) as buf:
+                    reader = PyPDF2.PdfReader(buf)
+                    for i, page in enumerate(reader.pages):
+                        try:
+                            text = page.extract_text() or ""
+                            if text.strip():
+                                parts.append(ContentPart(
+                                    id=makeId(),
+                                    parentId=rootId,
+                                    label=f"page_{i+1}",
+                                    typeGroup="text",
+                                    mimeType="text/plain",
+                                    data=text,
+                                    metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
+                                ))
+                        except Exception:
+                            continue
+            except Exception:
+                pass
+
        # Extract images with PyMuPDF
        try:
            with io.BytesIO(fileBytes) as buf2:
-                doc = fitz.open(stream=buf2, filetype="pdf")
+                doc = fitz.open(stream=buf2.getvalue(), filetype="pdf")
                for i in range(len(doc)):
                    page = doc[i]
                    images = page.get_images(full=True)