updated pdf extractor
This commit is contained in:
parent
5120fbc503
commit
205b99dfa0
1 changed file with 31 additions and 5 deletions
|
|
@@ -73,13 +73,14 @@ class PdfExtractor(Extractor):
|
||||||
))
|
))
|
||||||
return parts
|
return parts
|
||||||
|
|
||||||
# Extract text per page with PyPDF2
|
# Extract text per page with PyMuPDF (same lib as in-place search - ensures extraction matches PDF text layer)
|
||||||
try:
|
try:
|
||||||
with io.BytesIO(fileBytes) as buf:
|
with io.BytesIO(fileBytes) as buf:
|
||||||
reader = PyPDF2.PdfReader(buf)
|
doc = fitz.open(stream=buf.getvalue(), filetype="pdf")
|
||||||
for i, page in enumerate(reader.pages):
|
for i in range(len(doc)):
|
||||||
try:
|
try:
|
||||||
text = page.extract_text() or ""
|
page = doc[i]
|
||||||
|
text = page.get_text() or ""
|
||||||
if text.strip():
|
if text.strip():
|
||||||
parts.append(ContentPart(
|
parts.append(ContentPart(
|
||||||
id=makeId(),
|
id=makeId(),
|
||||||
|
|
@@ -92,13 +93,38 @@ class PdfExtractor(Extractor):
|
||||||
))
|
))
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
doc.close()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Fallback to PyPDF2 if PyMuPDF text extraction failed or returned nothing
|
||||||
|
has_text = any(getattr(p, 'typeGroup', '') == "text" for p in parts)
|
||||||
|
if not has_text:
|
||||||
|
try:
|
||||||
|
with io.BytesIO(fileBytes) as buf:
|
||||||
|
reader = PyPDF2.PdfReader(buf)
|
||||||
|
for i, page in enumerate(reader.pages):
|
||||||
|
try:
|
||||||
|
text = page.extract_text() or ""
|
||||||
|
if text.strip():
|
||||||
|
parts.append(ContentPart(
|
||||||
|
id=makeId(),
|
||||||
|
parentId=rootId,
|
||||||
|
label=f"page_{i+1}",
|
||||||
|
typeGroup="text",
|
||||||
|
mimeType="text/plain",
|
||||||
|
data=text,
|
||||||
|
metadata={"pages": 1, "pageIndex": i, "size": len(text.encode('utf-8'))}
|
||||||
|
))
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# Extract images with PyMuPDF
|
# Extract images with PyMuPDF
|
||||||
try:
|
try:
|
||||||
with io.BytesIO(fileBytes) as buf2:
|
with io.BytesIO(fileBytes) as buf2:
|
||||||
doc = fitz.open(stream=buf2, filetype="pdf")
|
doc = fitz.open(stream=buf2.getvalue(), filetype="pdf")
|
||||||
for i in range(len(doc)):
|
for i in range(len(doc)):
|
||||||
page = doc[i]
|
page = doc[i]
|
||||||
images = page.get_images(full=True)
|
images = page.get_images(full=True)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue