updated pdf extractor
This commit is contained in:
parent
5120fbc503
commit
205b99dfa0
1 changed file with 31 additions and 5 deletions
|
|
@@ -73,13 +73,14 @@ class PdfExtractor(Extractor):
|
|||
))
|
||||
return parts
|
||||
|
||||
# Extract text per page with PyPDF2
|
||||
# Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
|
||||
try:
|
||||
with io.BytesIO(fileBytes) as buf:
|
||||
reader = PyPDF2.PdfReader(buf)
|
||||
for i, page in enumerate(reader.pages):
|
||||
doc = fitz.open(stream=buf.getvalue(), filetype="pdf")
|
||||
for i in range(len(doc)):
|
||||
try:
|
||||
text = page.extract_text() or ""
|
||||
page = doc[i]
|
||||
text = page.get_text() or ""
|
||||
if text.strip():
|
||||
parts.append(ContentPart(
|
||||
id=makeId(),
|
||||
|
|
@@ -92,13 +93,38 @@ class PdfExtractor(Extractor):
|
|||
))
|
||||
except Exception:
|
||||
continue
|
||||
doc.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing
|
||||
has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts)
|
||||
if not has_text:
|
||||
try:
|
||||
with io.BytesIO(fileBytes) as buf:
|
||||
reader = PyPDF2.PdfReader(buf)
|
||||
for i, page in enumerate(reader.pages):
|
||||
try:
|
||||
text = page.extract_text() or ""
|
||||
if text.strip():
|
||||
parts.append(ContentPart(
|
||||
id=makeId(),
|
||||
parentId=rootId,
|
||||
label=f"page_{i+1}",
|
||||
typeGroup="text",
|
||||
mimeType="text/plain",
|
||||
data=text,
|
||||
metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
|
||||
))
|
||||
except Exception:
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Extract images with PyMuPDF
|
||||
try:
|
||||
with io.BytesIO(fileBytes) as buf2:
|
||||
doc = fitz.open(stream=buf2, filetype="pdf")
|
||||
doc = fitz.open(stream=buf2.getvalue(), filetype="pdf")
|
||||
for i in range(len(doc)):
|
||||
page = doc[i]
|
||||
images = page.get_images(full=True)
|
||||
|
|
|
|||
Loading…
Reference in a new issue