gateway/modules/services/serviceExtraction/chunking/text_chunker.py
2025-10-03 11:23:48 +02:00

33 lines
1.2 KiB
Python

from typing import Any, Dict, List
import logging
from modules.datamodels.datamodelExtraction import ContentPart
from ..subRegistry import Chunker
logger = logging.getLogger(__name__)
class TextChunker(Chunker):
    """Chunker that splits text into line-aligned chunks bounded by a byte budget.

    Lines are accumulated until adding the next one would exceed the configured
    ``textChunkSize`` (measured in UTF-8 bytes); the accumulated lines are then
    emitted as one chunk. A single line that is larger than the budget on its
    own is cut at UTF-8 character boundaries so that no chunk exceeds the budget.
    """

    def chunk(self, part: ContentPart, options: Dict[str, Any]) -> list[Dict[str, Any]]:
        """Split ``part.data`` into chunks of at most ``textChunkSize`` UTF-8 bytes.

        Args:
            part: Content part whose ``data`` attribute holds the text to split.
                A ``None`` value is treated like empty text.
            options: Chunking options. ``textChunkSize`` is the byte budget per
                chunk (converted with ``int``; defaults to 40000).

        Returns:
            A list of dicts, one per chunk, with the keys ``data`` (the chunk
            text, lines joined by ``'\\n'``), ``size`` (UTF-8 byte length of
            ``data``) and ``order`` (zero-based position of the chunk).
            Empty text yields a single chunk with empty ``data``.

        Raises:
            ValueError, TypeError: If ``textChunkSize`` cannot be converted
                to ``int``.
        """
        maxBytes = int(options.get("textChunkSize", 40000))
        # Lazy %-style arguments: the message is only built if DEBUG is enabled.
        logger.debug("TextChunker: textChunkSize from options: %s", options.get("textChunkSize", "NOT_FOUND"))
        logger.debug("TextChunker: using maxBytes: %s", maxBytes)

        chunks: List[Dict[str, Any]] = []
        current: List[str] = []
        size = 0

        text = part.data or ""
        for line in text.split('\n'):
            # A line that fits the budget comes back as a single piece; only
            # oversized lines are broken up.
            for piece, pieceBytes in self._splitOversizedLine(line, maxBytes):
                # +1 accounts for the '\n' separator used when joining lines.
                pieceSize = pieceBytes + 1
                if size + pieceSize > maxBytes and current:
                    chunks.append(self._buildChunk(current, len(chunks)))
                    current = [piece]
                    size = pieceSize
                else:
                    current.append(piece)
                    size += pieceSize

        if current:
            chunks.append(self._buildChunk(current, len(chunks)))
        return chunks

    @staticmethod
    def _buildChunk(lines: List[str], order: int) -> Dict[str, Any]:
        """Join ``lines`` with newlines and wrap them in a chunk dict."""
        data = '\n'.join(lines)
        return {"data": data, "size": len(data.encode('utf-8')), "order": order}

    @staticmethod
    def _splitOversizedLine(line: str, maxBytes: int) -> list[tuple[str, int]]:
        """Cut ``line`` into pieces of at most ``maxBytes`` UTF-8 bytes each.

        Returns ``(piece, byteLength)`` pairs. The line is returned unchanged
        as a single pair when it already fits or when ``maxBytes`` is not
        positive (no meaningful budget to split against). Cuts are only made
        between characters; if the budget is smaller than one encoded
        character, that character is emitted whole rather than corrupted.
        """
        raw = line.encode('utf-8')
        total = len(raw)
        if maxBytes <= 0 or total <= maxBytes:
            return [(line, total)]

        pieces: list[tuple[str, int]] = []
        start = 0
        while start < total:
            end = min(start + maxBytes, total)
            # Bytes of the form 0b10xxxxxx continue a multi-byte character;
            # back up until the cut lands on the first byte of a character.
            while start < end < total and (raw[end] & 0xC0) == 0x80:
                end -= 1
            if end == start:
                # Budget is smaller than this character: take the whole
                # character so the loop always makes progress.
                end = start + 1
                while end < total and (raw[end] & 0xC0) == 0x80:
                    end += 1
            pieces.append((raw[start:end].decode('utf-8'), end - start))
            start = end
        return pieces