module tests completed
This commit is contained in:
parent
db456f1667
commit
0280879968
3 changed files with 30 additions and 12 deletions
|
|
@ -240,7 +240,7 @@ class ExtractionService:
|
||||||
partSummary["dataPreview"] = f"[Large data: {len(part.data)} chars - truncated]"
|
partSummary["dataPreview"] = f"[Large data: {len(part.data)} chars - truncated]"
|
||||||
extractionSummary["parts"].append(partSummary)
|
extractionSummary["parts"].append(partSummary)
|
||||||
|
|
||||||
writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}")
|
writeDebugFile(json.dumps(extractionSummary, indent=2, ensure_ascii=False), f"extraction_result_{doc.fileName}.txt")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Failed to write extraction debug file: {str(e)}")
|
logger.debug(f"Failed to write extraction debug file: {str(e)}")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -700,12 +700,19 @@ class RendererHtml(BaseRenderer):
|
||||||
import base64
|
import base64
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Find all image data URIs in HTML (verschiedene MIME-Types unterstützen)
|
# Find entire img tags with data URIs and replace them
|
||||||
# Pattern: data:image/[type];base64,<base64>
|
# Pattern: <img src="data:image/[type];base64,<base64>" [other attributes]>
|
||||||
dataUriPattern = r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)'
|
imgTagPattern = r'<img\s+src="data:image/[^"]+"[^>]*>'
|
||||||
|
|
||||||
def replaceDataUri(match):
|
def replaceImgTag(match):
|
||||||
base64Data = match.group(1)
|
imgTag = match.group(0)
|
||||||
|
|
||||||
|
# Extract base64 data from the img tag
|
||||||
|
base64Match = re.search(r'data:image/[^;]+;base64,([A-Za-z0-9+/=]+)', imgTag)
|
||||||
|
if not base64Match:
|
||||||
|
return imgTag # Return original if no base64 found
|
||||||
|
|
||||||
|
base64Data = base64Match.group(1)
|
||||||
|
|
||||||
# Find matching image in images list
|
# Find matching image in images list
|
||||||
matchingImage = None
|
matchingImage = None
|
||||||
|
|
@ -721,12 +728,13 @@ class RendererHtml(BaseRenderer):
|
||||||
# Use filename from image data (generated from section ID)
|
# Use filename from image data (generated from section ID)
|
||||||
filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
|
filename = matchingImage.get("filename", f"image_{images.index(matchingImage) + 1}.png")
|
||||||
|
|
||||||
# Replace with relative path (ohne Pfad, nur Dateiname)
|
# Extract existing alt text or use from matchingImage
|
||||||
# Escape HTML in altText and caption to prevent injection
|
altMatch = re.search(r'alt="([^"]*)"', imgTag)
|
||||||
altText = html.escape(str(matchingImage.get("altText", "Image")))
|
existingAlt = altMatch.group(1) if altMatch else ""
|
||||||
|
altText = html.escape(str(matchingImage.get("altText", existingAlt or "Image")))
|
||||||
caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else ""
|
caption = html.escape(str(matchingImage.get("caption", ""))) if matchingImage.get("caption") else ""
|
||||||
|
|
||||||
# Entferne IMAGE_MARKER Kommentar falls vorhanden
|
# Create new img tag with filename
|
||||||
imgTag = f'<img src="{filename}" alt="{altText}">'
|
imgTag = f'<img src="{filename}" alt="{altText}">'
|
||||||
|
|
||||||
if caption:
|
if caption:
|
||||||
|
|
@ -737,8 +745,8 @@ class RendererHtml(BaseRenderer):
|
||||||
# Keep original if no match found
|
# Keep original if no match found
|
||||||
return match.group(0)
|
return match.group(0)
|
||||||
|
|
||||||
# Replace all data URIs (auch IMAGE_MARKER Kommentare entfernen)
|
# Replace all img tags with data URIs (leftover IMAGE_MARKER comments are removed below)
|
||||||
updatedHtml = re.sub(dataUriPattern, replaceDataUri, htmlContent)
|
updatedHtml = re.sub(imgTagPattern, replaceImgTag, htmlContent)
|
||||||
# Remove any IMAGE_MARKER comments that are left over
|
# Remove any IMAGE_MARKER comments that are left over
|
||||||
updatedHtml = re.sub(r'<!--IMAGE_MARKER:[^>]+-->', '', updatedHtml)
|
updatedHtml = re.sub(r'<!--IMAGE_MARKER:[^>]+-->', '', updatedHtml)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -180,6 +180,16 @@ def convertDocumentDataToString(document_data: Any, file_extension: str) -> str:
|
||||||
try:
|
try:
|
||||||
if document_data is None:
|
if document_data is None:
|
||||||
return ""
|
return ""
|
||||||
|
if isinstance(document_data, bytes):
|
||||||
|
# IMPORTANT: Decode bytes to string for text files (HTML, text, etc.)
|
||||||
|
try:
|
||||||
|
return document_data.decode('utf-8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# Fallback: try latin1 (it maps every byte value, so it should not fail); error replacement is only a defensive last resort
|
||||||
|
try:
|
||||||
|
return document_data.decode('latin1')
|
||||||
|
except Exception:
|
||||||
|
return document_data.decode('utf-8', errors='replace')
|
||||||
if isinstance(document_data, str):
|
if isinstance(document_data, str):
|
||||||
return document_data
|
return document_data
|
||||||
if isinstance(document_data, dict):
|
if isinstance(document_data, dict):
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue