89 lines
3.1 KiB
Python
89 lines
3.1 KiB
Python
from typing import Any, Dict, List
|
|
import io
|
|
|
|
from ..utils import makeId
|
|
from modules.datamodels.datamodelExtraction import ContentPart
|
|
from ..subRegistry import Extractor
|
|
|
|
|
|
class DocxExtractor(Extractor):
    """Extractor that flattens DOCX bytes into a list of ContentPart objects.

    The first part is always a "container" root. Non-blank paragraphs become
    "text" parts and tables become "table" parts holding CSV text; every
    child part points at the root through ``parentId``. python-docx is
    imported lazily on the first ``extract`` call; if that import fails, a
    single "binary" placeholder part carrying a warning is emitted instead
    of any document content.
    """

    def __init__(self):
        # Both flags are settled by the first _load() call.
        self._haveLibs = False
        self._loaded = False

    def _load(self):
        """Import python-docx at most once and record whether it succeeded.

        The module is bound to the module-level name ``docx`` so that
        ``extract`` can reference it afterwards.
        """
        global docx
        if not self._loaded:
            # Mark first so a failed import is not retried on later calls.
            self._loaded = True
            try:
                import docx  # python-docx
            except Exception:
                self._haveLibs = False
            else:
                self._haveLibs = True

    def detect(self, fileName: str, mimeType: str, headBytes: bytes) -> bool:
        """Tell whether the input looks like a DOCX file.

        Matches on the exact DOCX MIME type, or on a file name ending in
        ".docx" (case-insensitive; a falsy ``fileName`` is treated as "").
        ``headBytes`` is not inspected.
        """
        if mimeType == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            return True
        return (fileName or "").lower().endswith(".docx")

    def extract(self, fileBytes: bytes, context: Dict[str, Any]) -> List[ContentPart]:
        """Split a DOCX document into a root part plus text and table parts.

        Args:
            fileBytes: Raw bytes of the DOCX file.
            context: Not read by this method.

        Returns:
            A list whose first element is the "container" root part. When
            python-docx could not be imported, the only other element is a
            "binary" placeholder with a warning in its metadata. Otherwise
            the root is followed by one "text" part per paragraph with
            non-whitespace text (labelled ``p_<n>``, numbered over all
            paragraphs) and then one "table" part per table with non-empty
            CSV output (labelled ``table_<n>``).
        """
        self._load()

        docxMime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        rootId = makeId()
        byteCount = len(fileBytes)

        parts: List[ContentPart] = [
            ContentPart(
                id=rootId,
                parentId=None,
                label="docx",
                typeGroup="container",
                mimeType=docxMime,
                data="",
                metadata={"size": byteCount},
            )
        ]

        if not self._haveLibs:
            # Without python-docx nothing can be parsed: emit a placeholder.
            placeholder = ContentPart(
                id=makeId(),
                parentId=rootId,
                label="binary",
                typeGroup="binary",
                mimeType=docxMime,
                data="",
                metadata={"size": byteCount, "warning": "DOCX lib not available"},
            )
            parts.append(placeholder)
            return parts

        with io.BytesIO(fileBytes) as stream:
            document = docx.Document(stream)

            # Paragraphs: numbering counts every paragraph, so skipped blank
            # ones leave gaps in the p_<n> labels.
            for number, paragraph in enumerate(document.paragraphs, start=1):
                content = paragraph.text or ""
                if not content.strip():
                    continue
                parts.append(ContentPart(
                    id=makeId(),
                    parentId=rootId,
                    label=f"p_{number}",
                    typeGroup="text",
                    mimeType="text/plain",
                    data=content,
                    metadata={"size": len(content.encode('utf-8'))},
                ))

            # Tables: one CSV line per row, every cell wrapped in double
            # quotes with embedded double quotes doubled.
            for number, table in enumerate(document.tables, start=1):
                csvLines = [
                    ",".join(
                        '"' + (cell.text or "").replace('"', '""') + '"'
                        for cell in row.cells
                    )
                    for row in table.rows
                ]
                csvData = "\n".join(csvLines)
                if not csvData:
                    continue
                parts.append(ContentPart(
                    id=makeId(),
                    parentId=rootId,
                    label=f"table_{number}",
                    typeGroup="table",
                    mimeType="text/csv",
                    data=csvData,
                    metadata={"size": len(csvData.encode('utf-8'))},
                ))

        return parts
|
|
|
|
|