# gateway/modules/services/serviceExtraction/chunking/text_chunker.py

from typing import Any, Dict, List

from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker


class TextChunker(Chunker):
    """Chunker that cuts a part's text into byte-bounded chunks, preferring line breaks.

    Lines are packed greedily into chunks whose UTF-8 encoded size does not
    exceed the configured budget. A line that is larger than the budget on its
    own is cut on UTF-8 character boundaries so it cannot blow the limit.
    """

    # Byte budget applied when "textChunkSize" is missing or not positive.
    _DEFAULT_MAX_BYTES = 40000

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split ``part.data`` into chunks of at most ``textChunkSize`` UTF-8 bytes.

        Args:
            part: Object whose ``data`` attribute holds the text to split.
            options: Chunking options. ``"textChunkSize"`` is the maximum chunk
                size in bytes; it defaults to 40000, and a non-positive value
                falls back to that default (a warning is logged).

        Returns:
            A list of dicts, one per chunk, each with:
            ``"data"`` (the chunk text), ``"size"`` (its UTF-8 byte length) and
            ``"order"`` (its zero-based position in the list).
            Empty text yields a single empty chunk. A chunk only exceeds the
            budget when a single character is itself larger than the budget.

        Raises:
            ValueError, TypeError: If ``textChunkSize`` cannot be converted by ``int()``.
        """
        import logging

        logger = logging.getLogger(__name__)

        maxBytes = int(options.get("textChunkSize", self._DEFAULT_MAX_BYTES))
        logger.debug(
            "TextChunker: textChunkSize from options: %s",
            options.get("textChunkSize", "NOT_FOUND"),
        )
        if maxBytes <= 0:
            # A non-positive budget cannot hold any text; use the default
            # instead of degenerating into one chunk per line.
            logger.warning(
                "TextChunker: invalid textChunkSize %s, falling back to %s",
                maxBytes,
                self._DEFAULT_MAX_BYTES,
            )
            maxBytes = self._DEFAULT_MAX_BYTES
        logger.debug("TextChunker: using maxBytes: %s", maxBytes)

        chunks: List[Dict[str, Any]] = []

        def emit(text: str) -> None:
            chunks.append({"data": text, "size": len(text.encode('utf-8')), "order": len(chunks)})

        current: List[str] = []
        size = 0  # exact UTF-8 byte length of '\n'.join(current)
        for line in part.data.split('\n'):
            raw = line.encode('utf-8')
            lineSize = len(raw)

            if lineSize > maxBytes:
                # The line cannot fit in any chunk: close the pending chunk,
                # emit the full-sized pieces directly, and keep the remainder
                # open so the following lines can still be packed after it.
                if current:
                    emit('\n'.join(current))
                *fullPieces, remainder = self._splitUtf8(raw, maxBytes)
                for piece in fullPieces:
                    emit(piece)
                current = [remainder]
                size = len(remainder.encode('utf-8'))
                continue

            # The '\n' separator only exists between lines, so the first line
            # of a chunk costs no extra byte.
            needed = lineSize + 1 if current else lineSize
            if current and size + needed > maxBytes:
                emit('\n'.join(current))
                current = [line]
                size = lineSize
            else:
                current.append(line)
                size += needed

        if current:
            emit('\n'.join(current))
        return chunks

    @staticmethod
    def _splitUtf8(raw: bytes, maxBytes: int) -> List[str]:
        """Cut UTF-8 bytes into decoded pieces of at most ``maxBytes`` bytes each.

        Cuts are only made on character boundaries. If a single character is
        larger than ``maxBytes`` it is kept whole, so every piece holds at
        least one character and the loop always advances.
        """
        pieces: List[str] = []
        total = len(raw)
        start = 0
        while start < total:
            end = min(start + maxBytes, total)
            # Bytes of the form 0b10xxxxxx are continuation bytes; step back
            # until the cut lands in front of a character's lead byte.
            while start < end < total and (raw[end] & 0xC0) == 0x80:
                end -= 1
            if end == start:
                # The character at `start` alone exceeds the budget: take it whole.
                end = start + 1
                while end < total and (raw[end] & 0xC0) == 0x80:
                    end += 1
            pieces.append(raw[start:end].decode('utf-8'))
            start = end
        return pieces