Fixes STT parameters
This commit is contained in:
parent
16ab816c65
commit
e3284994d0
1 changed files with 60 additions and 16 deletions
|
|
@ -602,6 +602,13 @@ class TeamsbotService:
|
|||
self._lastTranscriptText: Optional[str] = None
|
||||
self._lastTranscriptId: Optional[str] = None
|
||||
self._lastSttTime: float = 0.0
|
||||
|
||||
# Audio chunk aggregation: collect chunks and send to STT only
|
||||
# after a speech pause or when the buffer reaches a target duration.
|
||||
self._audioBuffer: bytes = b""
|
||||
self._audioBufferStartTime: float = 0.0
|
||||
self._audioBufferLastChunkTime: float = 0.0
|
||||
self._audioBufferSampleRate: int = 16000
|
||||
self._lastBotResponseText: Optional[str] = None
|
||||
self._lastBotResponseTs: float = 0.0
|
||||
|
||||
|
|
@ -1203,6 +1210,14 @@ class TeamsbotService:
|
|||
interface.updateSession(sessionId, updates)
|
||||
await _emitSessionEvent(sessionId, "statusChange", {"status": status, "errorMessage": errorMessage})
|
||||
|
||||
# Drop any remaining buffered audio before generating the summary (the buffer is only cleared here, not sent to STT)
|
||||
if dbStatus in [TeamsbotSessionStatus.ENDED.value, TeamsbotSessionStatus.ERROR.value]:
|
||||
if self._audioBuffer:
|
||||
logger.info(f"[AudioChunk] Flushing remaining buffer on session end ({len(self._audioBuffer)} bytes)")
|
||||
self._audioBuffer = b""
|
||||
self._audioBufferStartTime = 0.0
|
||||
self._audioBufferLastChunkTime = 0.0
|
||||
|
||||
# Generate summary when session ends
|
||||
if dbStatus == TeamsbotSessionStatus.ENDED.value:
|
||||
asyncio.create_task(self._generateMeetingSummary(sessionId))
|
||||
|
|
@ -1217,11 +1232,18 @@ class TeamsbotService:
|
|||
voiceInterface,
|
||||
websocket: WebSocket,
|
||||
):
|
||||
"""Process an audio chunk from WebRTC capture — run STT and feed into transcript pipeline."""
|
||||
"""Process an audio chunk from WebRTC capture. The bot-side VAD
|
||||
(AudioWorklet / ScriptProcessor) already segments speech into 1-8s
|
||||
voiced chunks. Here we apply a minimum-duration safety net: very short
|
||||
chunks (<1s) are buffered until they reach 1s; everything else goes
|
||||
straight to STT. A wall-clock timeout flushes stale buffers; it is only evaluated when the next chunk arrives."""
|
||||
import base64
|
||||
_MIN_CHUNK_SEC = 1.0
|
||||
_STALE_TIMEOUT_SEC = 3.0
|
||||
|
||||
try:
|
||||
audioBytes = base64.b64decode(audioBase64)
|
||||
if len(audioBytes) < 1000:
|
||||
if len(audioBytes) < 500:
|
||||
return
|
||||
|
||||
if captureDiagnostics:
|
||||
|
|
@ -1234,14 +1256,12 @@ class TeamsbotService:
|
|||
f"rms={rms}, nativeRate={nativeSampleRate}, bytes={len(audioBytes)}"
|
||||
)
|
||||
|
||||
# Use RMS from capture diagnostics to skip real silence.
|
||||
# Byte-variation heuristics produced false positives and dropped valid speech.
|
||||
isSilent = False
|
||||
if captureDiagnostics and captureDiagnostics.get("rms") is not None:
|
||||
try:
|
||||
rmsVal = float(captureDiagnostics.get("rms"))
|
||||
if rmsVal < 0.0003:
|
||||
logger.debug(f"[AudioChunk] Skipping silent audio ({len(audioBytes)} bytes, rms={rmsVal:.6f})")
|
||||
return
|
||||
isSilent = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
@ -1249,23 +1269,47 @@ class TeamsbotService:
|
|||
logger.warning(f"[AudioChunk] No voice interface available for session {sessionId}")
|
||||
return
|
||||
|
||||
# Treat sampleRate=0 as unknown (triggers auto-detection)
|
||||
effectiveSampleRate = sampleRate if sampleRate and sampleRate > 0 else None
|
||||
now = time.time()
|
||||
effectiveRate = sampleRate if sampleRate and sampleRate > 0 else 16000
|
||||
|
||||
if not isSilent:
|
||||
if not self._audioBuffer:
|
||||
self._audioBufferStartTime = now
|
||||
self._audioBuffer += audioBytes
|
||||
self._audioBufferLastChunkTime = now
|
||||
self._audioBufferSampleRate = effectiveRate
|
||||
|
||||
bufferDuration = len(self._audioBuffer) / (effectiveRate * 2) if self._audioBuffer else 0.0
|
||||
bufferAge = (now - self._audioBufferStartTime) if self._audioBuffer else 0.0
|
||||
|
||||
shouldFlush = (
|
||||
self._audioBuffer
|
||||
and (
|
||||
bufferDuration >= _MIN_CHUNK_SEC
|
||||
or (bufferAge >= _STALE_TIMEOUT_SEC and bufferDuration > 0.3)
|
||||
)
|
||||
)
|
||||
|
||||
if not shouldFlush:
|
||||
return
|
||||
|
||||
flushBytes = self._audioBuffer
|
||||
flushRate = self._audioBufferSampleRate
|
||||
self._audioBuffer = b""
|
||||
self._audioBufferStartTime = 0.0
|
||||
self._audioBufferLastChunkTime = 0.0
|
||||
|
||||
flushDuration = len(flushBytes) / (flushRate * 2)
|
||||
logger.info(f"[AudioChunk] Flushing buffer: {len(flushBytes)} bytes, {flushDuration:.1f}s, {flushRate}Hz")
|
||||
|
||||
phraseHints = list(self._knownSpeakers)
|
||||
if self.config.botName:
|
||||
phraseHints.append(self.config.botName)
|
||||
|
||||
# The language comes exclusively from the session/instance config
|
||||
# (TeamsbotUserSettings.language overrides
|
||||
# TeamsbotConfig.language, fallback de-DE in the schema).
|
||||
# NO hard-coded pool of alternative languages — that pool caused
|
||||
# Google STT to jump to en-US on noisy audio
|
||||
# and return English gibberish.
|
||||
sttResult = await voiceInterface.speechToText(
|
||||
audioContent=audioBytes,
|
||||
audioContent=flushBytes,
|
||||
language=self.config.language or "de-DE",
|
||||
sampleRate=effectiveSampleRate,
|
||||
sampleRate=flushRate,
|
||||
channels=1,
|
||||
skipFallbacks=True,
|
||||
phraseHints=phraseHints if phraseHints else None,
|
||||
|
|
|
|||
Loading…
Reference in a new issue