From 205b99dfa0f138d81a6b4aa4d83918e32073acfb Mon Sep 17 00:00:00 2001 From: Ida Dittrich Date: Mon, 23 Feb 2026 11:20:51 +0100 Subject: [PATCH] updated pdf extractor --- .../extractors/extractorPdf.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/modules/services/serviceExtraction/extractors/extractorPdf.py b/modules/services/serviceExtraction/extractors/extractorPdf.py index 8a670e57..244aef90 100644 --- a/modules/services/serviceExtraction/extractors/extractorPdf.py +++ b/modules/services/serviceExtraction/extractors/extractorPdf.py @@ -73,13 +73,14 @@ class PdfExtractor(Extractor): )) return parts - # Extract text per page with PyPDF2 + # Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer) try: with io.BytesIO(fileBytes) as buf: - reader = PyPDF2.PdfReader(buf) - for i, page in enumerate(reader.pages): + doc = fitz.open(stream=buf.getvalue(), filetype="pdf") + for i in range(len(doc)): try: - text = page.extract_text() or "" + page = doc[i] + text = page.get_text() or "" if text.strip(): parts.append(ContentPart( id=makeId(), @@ -92,13 +93,38 @@ class PdfExtractor(Extractor): )) except Exception: continue + doc.close() except Exception: pass + # Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing + has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts) + if not has_text: + try: + with io.BytesIO(fileBytes) as buf: + reader = PyPDF2.PdfReader(buf) + for i, page in enumerate(reader.pages): + try: + text = page.extract_text() or "" + if text.strip(): + parts.append(ContentPart( + id=makeId(), + parentId=rootId, + label=f"page_{i+1}", + typeGroup="text", + mimeType="text/plain", + data=text, + metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))} + )) + except Exception: + continue + except Exception: + pass + # Extract images with PyMuPDF try: with io.BytesIO(fileBytes) as buf2: - doc = fitz.open(stream=buf2, filetype="pdf") + doc = fitz.open(stream=buf2.getvalue(), filetype="pdf") for i in range(len(doc)): page = doc[i] images = page.get_images(full=True)