Update PDF extractor: extract text with PyMuPDF, fall back to PyPDF2

2026-02-23 11:20:51 +01:00 · 2026-02-23 11:20:51 +01:00 · 205b99dfa0
commit 205b99dfa0
parent 5120fbc503
1 changed file with 31 additions and 5 deletions
--- a/modules/services/serviceExtraction/extractors/extractorPdf.py
+++ b/modules/services/serviceExtraction/extractors/extractorPdf.py
@@ -73,13 +73,14 @@ class PdfExtractor(Extractor):
            ))
            return parts
-        # Extract text per page with PyPDF2
+        # Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
        try:
            with io.BytesIO(fileBytes) as buf:
-                reader = PyPDF2.PdfReader(buf)
+                doc = fitz.open(stream=buf.getvalue(), filetype="pdf")
-                for i, page in enumerate(reader.pages):
+                for i in range(len(doc)):
                    try:
-                        text = page.extract_text() or ""
+                        page = doc[i]
                        text = page.get_text() or ""
                        if text.strip():
                            parts.append(ContentPart(
                                id=makeId(),
@@ -92,13 +93,38 @@ class PdfExtractor(Extractor):
                            ))
                    except Exception:
                        continue
                doc.close()
        except Exception:
            pass
        # Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing
        has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts)
        if not has_text:
            try:
                with io.BytesIO(fileBytes) as buf:
                    reader = PyPDF2.PdfReader(buf)
                    for i, page in enumerate(reader.pages):
                        try:
                            text = page.extract_text() or ""
                            if text.strip():
                                parts.append(ContentPart(
                                    id=makeId(),
                                    parentId=rootId,
                                    label=f"page_{i+1}",
                                    typeGroup="text",
                                    mimeType="text/plain",
                                    data=text,
                                    metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
                                ))
                        except Exception:
                            continue
            except Exception:
                pass
        # Extract images with PyMuPDF
        try:
            with io.BytesIO(fileBytes) as buf2:
-                doc = fitz.open(stream=buf2, filetype="pdf")
+                doc = fitz.open(stream=buf2.getvalue(), filetype="pdf")
                for i in range(len(doc)):
                    page = doc[i]
                    images = page.get_images(full=True)