Update PDF extractor: extract page text with PyMuPDF, fall back to PyPDF2

This commit is contained in:
Ida Dittrich 2026-02-23 11:20:51 +01:00
parent 5120fbc503
commit 205b99dfa0

View file

@ -73,13 +73,14 @@ class PdfExtractor(Extractor):
)) ))
return parts return parts
# Extract text per page with PyPDF2 # Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
try: try:
with io.BytesIO(fileBytes) as buf: with io.BytesIO(fileBytes) as buf:
reader = PyPDF2.PdfReader(buf) doc = fitz.open(stream=buf.getvalue(), filetype="pdf")
for i, page in enumerate(reader.pages): for i in range(len(doc)):
try: try:
text = page.extract_text() or "" page = doc[i]
text = page.get_text() or ""
if text.strip(): if text.strip():
parts.append(ContentPart( parts.append(ContentPart(
id=makeId(), id=makeId(),
@ -92,13 +93,38 @@ class PdfExtractor(Extractor):
)) ))
except Exception: except Exception:
continue continue
doc.close()
except Exception: except Exception:
pass pass
# Fall back to PyPDF2 if PyMuPDF text extraction failed or returned no text
has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts)
if not has_text:
try:
with io.BytesIO(fileBytes) as buf:
reader = PyPDF2.PdfReader(buf)
for i, page in enumerate(reader.pages):
try:
text = page.extract_text() or ""
if text.strip():
parts.append(ContentPart(
id=makeId(),
parentId=rootId,
label=f"page_{i+1}",
typeGroup="text",
mimeType="text/plain",
data=text,
metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
))
except Exception:
continue
except Exception:
pass
# Extract images with PyMuPDF # Extract images with PyMuPDF
try: try:
with io.BytesIO(fileBytes) as buf2: with io.BytesIO(fileBytes) as buf2:
doc = fitz.open(stream=buf2, filetype="pdf") doc = fitz.open(stream=buf2.getvalue(), filetype="pdf")
for i in range(len(doc)): for i in range(len(doc)):
page = doc[i] page = doc[i]
images = page.get_images(full=True) images = page.get_images(full=True)