platform-core/modules/serviceCenter/services/serviceGeneration/subDocumentUtility.py

# Copyright (c) 2025 Patrick Motsch
# All rights reserved.
import json
import logging
import os
import re
from typing import Any, Dict, List, Optional
import io

logger = logging.getLogger(__name__)

_MAX_AUTO_TABLE_COLS = 64
_MAX_AUTO_TABLE_ROWS = 5000
_MAX_AUTO_CELL_CHARS = 8000


def _sanitize_cell_for_pipe_table(cell: str) -> str:
    """Single-line cell safe for markdown pipe tables (no raw ``|``)."""
    s = str(cell).replace("\r\n", "\n").replace("\r", "\n")
    s = " ".join(line.strip() for line in s.split("\n") if line.strip()).strip()
    return s.replace("|", "·")


def _try_delimited_block_as_markdown_table(block: str) -> Optional[str]:
    """Render ``block`` as a markdown pipe table when it is a uniform delimited grid.

    The block is split into trimmed, non-blank lines. TAB is tried first, then
    ``;``. A delimiter is accepted only when every line splits into at least two
    cells, all lines have the same number of cells, the grid stays within
    ``_MAX_AUTO_TABLE_COLS`` / ``_MAX_AUTO_TABLE_ROWS`` and no sanitized cell is
    longer than ``_MAX_AUTO_CELL_CHARS``. The first line becomes the header row.

    Returns:
        The pipe-table markdown, or ``None`` when fewer than two lines remain or
        neither delimiter yields a valid grid.
    """
    unified = block.replace("\r\n", "\n").replace("\r", "\n")
    candidates: List[str] = []
    for raw in unified.split("\n"):
        trimmed = raw.strip()
        if trimmed:
            candidates.append(trimmed)
    if len(candidates) < 2:
        return None

    def _split_all(delimiter: str) -> Optional[List[List[str]]]:
        # Stops at the first line that does not contain the delimiter.
        grid: List[List[str]] = []
        for text in candidates:
            parts = [part.strip() for part in text.split(delimiter)]
            if len(parts) < 2:
                return None
            grid.append(parts)
        return grid

    def _as_pipe_row(cells: List[str]) -> str:
        return "| " + " | ".join(_sanitize_cell_for_pipe_table(c) for c in cells) + " |"

    for delimiter in ("\t", ";"):
        grid = _split_all(delimiter)
        if grid is None:
            continue
        width = len(grid[0])
        if width > _MAX_AUTO_TABLE_COLS or len(grid) > _MAX_AUTO_TABLE_ROWS:
            continue
        if not all(len(row) == width for row in grid):
            continue
        oversized = any(
            len(_sanitize_cell_for_pipe_table(cell)) > _MAX_AUTO_CELL_CHARS
            for row in grid
            for cell in row
        )
        if oversized:
            continue

        rendered = [_as_pipe_row(row) for row in grid]
        divider = "| " + " | ".join(["---"] * width) + " |"
        # There are always at least two lines here, so one or more body rows follow.
        return "\n".join([rendered[0], divider] + rendered[1:])
    return None


def enhancePlainTextWithMarkdownTables(body: str) -> str:
    """Turn delimiter-separated grids inside plain text into markdown pipe tables.

    The text is split into blocks at blank lines. Each block is offered to
    ``_try_delimited_block_as_markdown_table``; blocks it converts are replaced
    by the resulting table, all others are kept (trimmed). Blocks are re-joined
    with exactly one blank line between them.

    Returns:
        ``""`` for non-string input, the input unchanged when it is empty or
        whitespace-only, otherwise the re-assembled text.
    """
    if not isinstance(body, str):
        return ""
    if not body.strip():
        return body

    rebuilt: List[str] = []
    for block in re.split(r"\n\s*\n", body.strip()):
        block = block.strip()
        if block:
            rebuilt.append(_try_delimited_block_as_markdown_table(block) or block)
    return "\n\n".join(rebuilt)


from modules.shared.documentUtils import parseInlineRuns  # noqa: F401 — canonical source in shared/


def _imageRefToDict(token: str) -> Optional[Dict[str, Any]]:
    """Parse one image reference line into an image content dict.

    Accepts markdown image syntax ``![alt](file:ID "200pt")`` / ``![alt](https://...)``
    or a bare ``file:ID`` / URL. Returns None for blank lines.
    """
    token = (token or "").strip()
    if not token:
        return None
    m = re.match(r"^!\[([^\]]*)\]\(([^)\"]+)(?:\s+\"(\d+)pt\")?\)\s*$", token)
    if m:
        alt = (m.group(1) or "").strip() or "Image"
        src = (m.group(2) or "").strip()
        widthStr = m.group(3)
    else:
        alt, src, widthStr = "Image", token, None
    fileId = src[5:] if src.startswith("file:") else ""
    out: Dict[str, Any] = {"altText": alt, "base64Data": "", "_fileRef": fileId, "_srcUrl": src if not fileId else ""}
    if widthStr:
        out["widthPt"] = int(widthStr)
    return out


def _parseCoverPageBlock(codeLines: List[str]) -> Dict[str, Any]:
    """Parse a ```cover_page fenced block of ``key: value`` lines.

    Recognized keys: title, subtitle, author, date, logo (``file:ID`` or URL).
    """
    content: Dict[str, Any] = {}
    for raw in codeLines:
        if ":" not in raw:
            continue
        key, _, value = raw.partition(":")
        key = key.strip().lower()
        value = value.strip()
        if not value:
            continue
        if key in ("title", "subtitle", "author", "date"):
            content[key] = value
        elif key == "logo":
            content["_fileRef"] = value[5:] if value.startswith("file:") else ""
            if not content["_fileRef"]:
                content["_srcUrl"] = value
    return content


def _parseImageGridBlock(codeLines: List[str]) -> Dict[str, Any]:
    """Parse a ```image_grid fenced block: optional ``columns: N`` plus image refs."""
    columns = 2
    images: List[Dict[str, Any]] = []
    for raw in codeLines:
        stripped = raw.strip()
        m = re.match(r"^columns\s*:\s*(\d+)\s*$", stripped, re.IGNORECASE)
        if m:
            columns = max(1, int(m.group(1)))
            continue
        img = _imageRefToDict(stripped)
        if img:
            images.append(img)
    return {"columns": columns, "images": images}


def markdownToDocumentJson(markdown: str, title: str, language: str = "de") -> Dict[str, Any]:
    """
    Convert markdown content to the standard document JSON format with Inline-Run model.

    Sections use inlineRuns (list of run dicts produced by ``parseInlineRuns``)
    instead of plain text strings. Recognized blocks, checked in this order for
    every line: headings, fenced code blocks (including the ``cover_page`` and
    ``image_grid`` layout primitives), pipe tables, bullet/numbered lists,
    standalone images and paragraphs.

    CRLF / CR line endings are normalized to LF before parsing. A non-blank
    line that no block handler accepts (for example a pipe row without a
    divider row beneath it) is emitted as paragraph text rather than discarded.

    Args:
        markdown: Markdown source. Non-string values are converted with
            ``str()``; falsy non-string values are treated as empty.
        title: Title stored in the metadata and on the single document.
        language: Language code stored in the metadata.

    Returns:
        A dict with ``metadata`` and a one-element ``documents`` list whose
        ``sections`` are numbered ``s_1``, ``s_2``, ... in document order. If
        nothing was parsed, a single paragraph section with the text
        ``"(empty)"`` is returned.
    """
    if not isinstance(markdown, str):
        markdown = str(markdown) if markdown else ""
    # Normalize line endings: a trailing "\r" would defeat the "$"-anchored
    # table patterns and leak into code blocks and paragraph text.
    markdown = markdown.replace("\r\n", "\n").replace("\r", "\n")

    headingRe = re.compile(r"^(#{1,6})\s+(.+)")
    fenceRe = re.compile(r"^```(\w*)")
    tableRowRe = re.compile(r"^\|(.+)\|$")
    tableDividerRe = re.compile(r"^\|[\s\-:|]+\|$")
    listItemRe = re.compile(r"^(\s*)([-*+]|\d+[.)]) (.+)")
    imageRe = re.compile(r"^!\[([^\]]*)\]\(([^)\"]+)(?:\s+\"(\d+)pt\")?\)\s*$")
    # Lines that end a running paragraph because they look like another block.
    blockStartRe = re.compile(
        r"^(#{1,6}\s|```|\|.+\||!\[[^\]]*\]\([^)]+\)\s*$|(\s*)([-*+]|\d+[.)]) )"
    )

    sections: List[Dict[str, Any]] = []
    order = 0
    lines = markdown.split("\n")
    i = 0

    def _addSection(contentType: str, content: Dict[str, Any]) -> None:
        """Append one section with the next running id / order number."""
        nonlocal order
        order += 1
        sections.append({
            "id": f"s_{order}", "content_type": contentType, "order": order,
            "elements": [{"content": content}],
        })

    while i < len(lines):
        line = lines[i]

        # Headings (plain text, no inline formatting)
        headingMatch = headingRe.match(line)
        if headingMatch:
            _addSection("heading", {
                "text": headingMatch.group(2).strip(),
                "level": len(headingMatch.group(1)),
            })
            i += 1
            continue

        # Fenced code blocks (no inline formatting)
        fenceMatch = fenceRe.match(line)
        if fenceMatch:
            lang = fenceMatch.group(1) or "text"
            codeLines: List[str] = []
            i += 1
            while i < len(lines) and not lines[i].startswith("```"):
                codeLines.append(lines[i])
                i += 1
            i += 1  # step over the closing fence (an unterminated block runs to the end)
            # Layout primitives are authored as fenced blocks with a special "language".
            if lang == "cover_page":
                _addSection("cover_page", _parseCoverPageBlock(codeLines))
            elif lang == "image_grid":
                _addSection("image_grid", _parseImageGridBlock(codeLines))
            else:
                _addSection("code_block", {"code": "\n".join(codeLines), "language": lang})
            continue

        # Tables - cells are List[InlineRun]; a table needs a divider row on the next line
        tableMatch = tableRowRe.match(line)
        if tableMatch and (i + 1) < len(lines) and tableDividerRe.match(lines[i + 1]):
            headerCells = [parseInlineRuns(c.strip()) for c in tableMatch.group(1).split("|")]
            i += 2
            rows = []
            while i < len(lines):
                rowMatch = tableRowRe.match(lines[i])
                if not rowMatch:
                    break
                rows.append([parseInlineRuns(c.strip()) for c in rowMatch.group(1).split("|")])
                i += 1
            _addSection("table", {"headers": headerCells, "rows": rows})
            continue

        # Bullet / numbered lists - items are List[List[InlineRun]]
        listMatch = listItemRe.match(line)
        if listMatch:
            # The first item's marker decides the list type for the whole run.
            isNumbered = bool(re.match(r"\d+[.)]", listMatch.group(2)))
            items = []
            while i < len(lines):
                itemMatch = listItemRe.match(lines[i])
                if not itemMatch:
                    break
                items.append(parseInlineRuns(itemMatch.group(3).strip()))
                i += 1
            _addSection("bullet_list", {
                "items": items,
                "list_type": "numbered" if isNumbered else "bullet",
            })
            continue

        # Empty lines
        if not line.strip():
            i += 1
            continue

        # Standalone image on its own line -> block-level image section
        imgMatch = imageRe.match(line)
        if imgMatch:
            src = imgMatch.group(2).strip()
            fileId = src[5:] if src.startswith("file:") else ""
            content: Dict[str, Any] = {
                "altText": imgMatch.group(1).strip() or "Image",
                "base64Data": "",
                "_fileRef": fileId,
                "_srcUrl": src if not fileId else "",
            }
            if imgMatch.group(3):
                content["widthPt"] = int(imgMatch.group(3))
            _addSection("image", content)
            i += 1
            continue

        # Paragraph - produces inlineRuns. No handler above accepted this
        # non-blank line, so it always opens the paragraph, even when it
        # resembles a block start (e.g. "|a|b|" without a divider row).
        # Otherwise its text would be lost.
        paraLines = [line]
        i += 1
        while i < len(lines) and lines[i].strip() and not blockStartRe.match(lines[i]):
            paraLines.append(lines[i])
            i += 1
        _addSection("paragraph", {"inlineRuns": parseInlineRuns(" ".join(paraLines))})

    if not sections:
        fallbackText = markdown.strip() or "(empty)"
        _addSection("paragraph", {"inlineRuns": parseInlineRuns(fallbackText)})

    return {
        "metadata": {
            "split_strategy": "single_document",
            "source_documents": [],
            "extraction_method": "file_create_rendering",
            "title": title,
            "language": language,
        },
        "documents": [{
            "id": "doc_1",
            "title": title,
            "sections": sections,
        }],
    }

def getFileExtension(fileName: str) -> str:
    """Return the lowercased text after the last dot of ``fileName``.

    The result carries no leading dot. It is ``''`` when the name contains no
    dot or ends with one.
    """
    if '.' not in fileName:
        return ''
    _, _, suffix = fileName.rpartition('.')
    return suffix.lower()

def getMimeTypeFromExtension(extension: str) -> str:
    """
    Look up the MIME type for a file extension.

    Args:
        extension: File extension, with or without one leading dot; matching
            is case-insensitive.

    Returns:
        str: The mapped MIME type, or 'application/octet-stream' when the
        extension is not in the table.
    """
    knownTypes = {
        # Text and source formats
        'txt': 'text/plain',
        'md': 'text/markdown',
        'csv': 'text/csv',
        'html': 'text/html',
        'htm': 'text/html',
        'css': 'text/css',
        'py': 'text/x-python',
        'js': 'application/javascript',
        'json': 'application/json',
        'xml': 'application/xml',
        # Office documents
        'pdf': 'application/pdf',
        'doc': 'application/msword',
        'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'xls': 'application/vnd.ms-excel',
        'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'ppt': 'application/vnd.ms-powerpoint',
        'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        # Images
        'svg': 'image/svg+xml',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'png': 'image/png',
        'gif': 'image/gif',
        'bmp': 'image/bmp',
        'webp': 'image/webp',
        # Archives
        'zip': 'application/zip',
        'rar': 'application/x-rar-compressed',
        '7z': 'application/x-7z-compressed',
        'tar': 'application/x-tar',
        'gz': 'application/gzip',
    }

    # Only a single leading dot is removed before the lookup.
    key = extension[1:] if extension.startswith('.') else extension
    return knownTypes.get(key.lower(), 'application/octet-stream')

def detectContentTypeFromData(fileData: bytes, fileName: str) -> str:
    """
    Detect the MIME type from the file name and, failing that, the leading bytes.

    A known extension always wins. Otherwise the data is checked against a few
    magic prefixes. Data starting with ``<`` is classified as HTML or SVG
    according to whichever marker appears first in the text, so an HTML page
    that embeds inline ``<svg>`` is reported as HTML; markup with neither
    marker is reported as XML.

    Args:
        fileData: Raw file data as bytes
        fileName: Name of the file; may be empty or None, in which case only
            the content is inspected

    Returns:
        str: Detected MIME type; 'application/octet-stream' when nothing
        matches or an error occurs (errors are logged, not raised)
    """
    try:
        # Check file extension first
        ext = os.path.splitext(fileName)[1].lower() if fileName else ''
        if ext:
            # NOTE(review): getMimeTypeFromExtension maps 'py' to 'text/x-python'
            # while this table uses 'application/x-python' - confirm which one
            # is intended before unifying the two tables.
            extToMime = {
                '.txt': 'text/plain',
                '.md': 'text/markdown',
                '.csv': 'text/csv',
                '.json': 'application/json',
                '.xml': 'application/xml',
                '.js': 'application/javascript',
                '.py': 'application/x-python',
                '.svg': 'image/svg+xml',
                '.jpg': 'image/jpeg',
                '.jpeg': 'image/jpeg',
                '.png': 'image/png',
                '.gif': 'image/gif',
                '.bmp': 'image/bmp',
                '.webp': 'image/webp',
                '.pdf': 'application/pdf',
                '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
                '.doc': 'application/msword',
                '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                '.xls': 'application/vnd.ms-excel',
                '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
                '.ppt': 'application/vnd.ms-powerpoint',
                '.html': 'text/html',
                '.htm': 'text/html',
                '.css': 'text/css',
                '.zip': 'application/zip',
                '.rar': 'application/x-rar-compressed',
                '.7z': 'application/x-7z-compressed',
                '.tar': 'application/x-tar',
                '.gz': 'application/gzip'
            }
            if ext in extToMime:
                return extToMime[ext]

        # Try to detect from content
        if fileData.startswith(b'%PDF'):
            return 'application/pdf'
        if fileData.startswith(b'PK\x03\x04'):
            # ZIP-based formats (docx, xlsx, pptx) are all reported as plain zip
            return 'application/zip'
        if fileData.startswith(b'<'):
            # Markup: errors='ignore' means decoding cannot fail on bytes input.
            markup = fileData.decode('utf-8', errors='ignore').lower()
            svgPos = markup.find('<svg')
            htmlHits = [
                pos for pos in (markup.find('<html'), markup.find('<!doctype html'))
                if pos != -1
            ]
            htmlPos = min(htmlHits) if htmlHits else -1
            # The marker that appears first decides, so inline SVG inside an
            # HTML page does not turn the page into an image.
            if htmlPos != -1 and (svgPos == -1 or htmlPos < svgPos):
                return 'text/html'
            if svgPos != -1:
                return 'image/svg+xml'
            return 'application/xml'
        if fileData.startswith(b'\x89PNG\r\n\x1a\n'):
            return 'image/png'
        if fileData.startswith(b'\xff\xd8\xff'):
            return 'image/jpeg'
        if fileData.startswith((b'GIF87a', b'GIF89a')):
            return 'image/gif'
        if fileData.startswith(b'BM'):
            return 'image/bmp'
        if fileData.startswith(b'RIFF') and fileData[8:12] == b'WEBP':
            return 'image/webp'

        return 'application/octet-stream'

    except Exception as e:
        logger.error(f"Error detecting content type from data: {str(e)}")
        return 'application/octet-stream'

def detectMimeTypeFromData(file_bytes: bytes, fileName: str, service=None) -> str:
    """Detect the MIME type of ``file_bytes``, preferring ``service`` when it can help.

    If ``service`` is truthy and exposes ``detectContentTypeFromData``, its
    answer is used unless it is empty or 'application/octet-stream'. In every
    other case the module-level ``detectContentTypeFromData`` decides. Any
    exception is logged as a warning and yields 'application/octet-stream'.
    """
    try:
        canDelegate = bool(service) and hasattr(service, 'detectContentTypeFromData')
        if canDelegate:
            serviceMime = service.detectContentTypeFromData(file_bytes, fileName)
            isSpecific = bool(serviceMime) and serviceMime != 'application/octet-stream'
            if isSpecific:
                return serviceMime
        # Fallback: use our consolidated function
        return detectContentTypeFromData(file_bytes, fileName)
    except Exception as e:
        logger.warning(f"Error in MIME type detection for {fileName}: {str(e)}")
        return 'application/octet-stream'

def detectMimeTypeFromContent(content: Any, fileName: str, service=None) -> str:
    """Detect the MIME type of arbitrary content via ``detectMimeTypeFromData``.

    The content is first turned into bytes:

    * ``bytes`` / ``bytearray`` are passed through unchanged, so their magic
      bytes stay intact (``str(b"...")`` would yield the ``b'...'`` repr),
    * ``str`` is UTF-8 encoded,
    * ``dict`` is serialized as JSON and UTF-8 encoded,
    * anything else is converted with ``str()`` and UTF-8 encoded.

    Args:
        content: The content to inspect.
        fileName: File name forwarded to the detector.
        service: Optional service object forwarded to ``detectMimeTypeFromData``.

    Returns:
        The detected MIME type; 'application/octet-stream' when an error occurs
        (the error is logged as a warning).
    """
    try:
        if isinstance(content, (bytes, bytearray)):
            file_bytes = bytes(content)
        elif isinstance(content, str):
            file_bytes = content.encode('utf-8')
        elif isinstance(content, dict):
            file_bytes = json.dumps(content, ensure_ascii=False).encode('utf-8')
        else:
            file_bytes = str(content).encode('utf-8')
        return detectMimeTypeFromData(file_bytes, fileName, service)
    except Exception as e:
        logger.warning(f"Error in MIME type detection for {fileName}: {str(e)}")
        return 'application/octet-stream'

def _rowsToCsv(rows: List[Any]) -> str:
    """Serialize ``rows`` to CSV text (private helper of convertDocumentDataToString).

    When the first row is a dict, a header is written containing the keys of
    all dict rows in first-seen order; keys missing from a row are left empty.
    Otherwise each row is written as-is, and a row that is not a list/tuple
    (e.g. a string or number) becomes a single-cell row.
    """
    import csv  # function-scope import: csv is not imported at file level

    output = io.StringIO()
    if rows and isinstance(rows[0], dict):
        # Collect the union of keys: using only the first row's keys makes
        # DictWriter raise as soon as a later row carries an additional key.
        fieldnames: List[Any] = []
        seen = set()
        for row in rows:
            if not isinstance(row, dict):
                continue
            for key in row:
                if key not in seen:
                    seen.add(key)
                    fieldnames.append(key)
        writer = csv.DictWriter(output, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    else:
        # Wrapping non-sequence rows prevents strings from being split into
        # one cell per character.
        csv.writer(output).writerows(
            row if isinstance(row, (list, tuple)) else [row] for row in rows
        )
    return output.getvalue()


def convertDocumentDataToString(document_data: Any, file_extension: str) -> str:
    """Convert document data to string content based on the target file type.

    * ``None`` becomes ``""``; strings are returned unchanged.
    * ``bytes`` / ``bytearray`` are decoded as UTF-8, falling back to latin1.
    * ``dict``: for text-like extensions the first present text field is
      returned (strings as-is, dicts/lists as JSON); for ``csv`` the first
      present table field is returned (strings as-is, a non-empty list of
      lists/dicts as CSV); in all other cases the whole dict is dumped as JSON.
    * ``list``: written as CSV for ``csv``, otherwise dumped as JSON.
    * Anything else is converted with ``str()``.

    Args:
        document_data: The data to convert.
        file_extension: Target extension without dot (e.g. 'json', 'csv', 'md').

    Returns:
        The string content. On any error the error is logged and
        ``str(document_data)`` is returned.
    """
    try:
        if document_data is None:
            return ""
        if isinstance(document_data, (bytes, bytearray)):
            # IMPORTANT: decode bytes to string for text files (HTML, text, etc.)
            try:
                return document_data.decode('utf-8')
            except UnicodeDecodeError:
                # latin1 maps every byte value, so this fallback cannot fail.
                return document_data.decode('latin1')
        if isinstance(document_data, str):
            return document_data
        if isinstance(document_data, dict):
            if file_extension in ('txt', 'md', 'html', 'css', 'js', 'py'):
                text_fields = ('content', 'text', 'data', 'result', 'summary', 'extracted_content', 'table_data')
                for field in text_fields:
                    if field not in document_data:
                        continue
                    content = document_data[field]
                    if isinstance(content, str):
                        return content
                    if isinstance(content, (dict, list)):
                        return json.dumps(content, indent=2, ensure_ascii=False)
            elif file_extension == 'csv':
                csv_fields = ('table_data', 'csv_data', 'rows', 'data', 'content', 'text')
                for field in csv_fields:
                    if field not in document_data:
                        continue
                    content = document_data[field]
                    if isinstance(content, str):
                        return content
                    if isinstance(content, list) and content and isinstance(content[0], (list, dict)):
                        return _rowsToCsv(content)
            # 'json', unknown extensions, and dicts without a usable field
            return json.dumps(document_data, indent=2, ensure_ascii=False)
        if isinstance(document_data, list):
            if file_extension == 'csv':
                return _rowsToCsv(document_data)
            return json.dumps(document_data, indent=2, ensure_ascii=False)
        return str(document_data)
    except Exception as e:
        logger.error(f"Error converting document data to string: {str(e)}")
        return str(document_data)