feat: unified audio buffer + fragment reassembly fix

This commit is contained in:
Jose Luis Montañes Ojados
2026-01-16 14:19:02 +01:00
parent fb17813dcb
commit 8ef80530f6
6 changed files with 392 additions and 250 deletions

View File

@@ -2,10 +2,10 @@ package main
import (
"flag"
"fmt"
"log"
"os"
"os/signal"
"strings"
"sync"
"syscall"
"time"
@@ -14,35 +14,48 @@ import (
"go-ts/pkg/xai"
)
// VoiceSession represents an active xAI voice session for a user
type VoiceSession struct {
ClientID uint16
Nickname string
XAI *xai.Client
AudioBuffer []int16 // Buffer to accumulate audio samples
AudioQueue chan []int16 // Queue for sending audio with proper timing
done chan struct{} // Signal to stop audio sender
// UserInfo tracks connected users (no individual sessions)
type UserInfo struct {
ClientID uint16
Nickname string
}
// Bot manages the TeamSpeak connection and xAI sessions
// Bot manages the TeamSpeak connection and single global xAI session
type Bot struct {
ts3 *ts3client.Client
apiKey string
voice string
prompt string
greeting string // Optional greeting when user joins
greeting string
selfID uint16 // Our own ClientID
sessions map[uint16]*VoiceSession
sessionsMu sync.RWMutex
startTime time.Time
selfID uint16
users map[uint16]*UserInfo
usersMu sync.RWMutex
startTime time.Time
// Global xAI Session (one for all users)
globalXAI *xai.Client
globalMu sync.Mutex
// Input audio mixing (multiple users → one stream)
inputBuffer []int16
inputMu sync.Mutex
lastInputTime time.Time
// Output audio (xAI response → TeamSpeak)
outputQueue chan []int16
outputBuffer []int16
outputMu sync.Mutex
done chan struct{}
}
func main() {
serverAddr := flag.String("server", "127.0.0.1:9987", "TeamSpeak 3 Server Address")
nickname := flag.String("nickname", "GrokBot", "Bot nickname")
voice := flag.String("voice", xai.VoiceAra, "xAI voice (Ara, Rex, Sal, Eve, Leo)")
greeting := flag.String("greeting", "Saluda brevemente al usuario que acaba de unirse.", "Greeting message (empty to disable)")
greeting := flag.String("greeting", "", "Greeting message when users join (empty to disable)")
room := flag.String("room", "", "Channel name to join after connecting (empty = stay in default)")
flag.Parse()
apiKey := os.Getenv("XAI_API_KEY")
@@ -50,18 +63,21 @@ func main() {
log.Fatal("XAI_API_KEY environment variable not set")
}
log.Println("=== xAI Voice Bot for TeamSpeak ===")
log.Println("=== xAI Voice Bot for TeamSpeak (Unified Session) ===")
log.Printf("Server: %s", *serverAddr)
log.Printf("Nickname: %s", *nickname)
log.Printf("Voice: %s", *voice)
bot := &Bot{
apiKey: apiKey,
voice: *voice,
prompt: "Eres Grok, un asistente de voz amigable y útil. Responde de forma concisa y natural.",
greeting: *greeting,
sessions: make(map[uint16]*VoiceSession),
startTime: time.Now(),
apiKey: apiKey,
voice: *voice,
prompt: "Eres Grok, un asistente de voz amigable y útil en un canal de TeamSpeak. Puedes escuchar a múltiples personas hablando. Responde de forma concisa y natural. Si varias personas hablan, trata de entender el contexto de la conversación grupal.",
greeting: *greeting,
users: make(map[uint16]*UserInfo),
startTime: time.Now(),
inputBuffer: make([]int16, 0, 960*50), // ~1 second buffer
outputQueue: make(chan []int16, 500), // ~10 seconds of audio
done: make(chan struct{}),
}
// Create TeamSpeak client
@@ -71,44 +87,81 @@ func main() {
// Register event handlers
bot.ts3.On(ts3client.EventConnected, func(e *ts3client.ConnectedEvent) {
bot.selfID = e.ClientID // Store our own ID
bot.selfID = e.ClientID
log.Printf("✓ Conectado a TeamSpeak! ClientID=%d, Server=%s", e.ClientID, e.ServerName)
// Initialize global xAI session after connecting
go func() {
if err := bot.initGlobalSession(); err != nil {
log.Printf("[Global] Error iniciando sesión xAI: %v", err)
}
}()
})
bot.ts3.On(ts3client.EventChannelList, func(e *ts3client.ChannelListEvent) {
log.Printf("✓ %d canales disponibles", len(e.Channels))
// Join specified room if provided
if *room != "" {
go func() {
// Small delay to ensure connection is fully established
time.Sleep(500 * time.Millisecond)
ch := bot.ts3.GetChannelByName(*room)
if ch != nil {
log.Printf("[Room] Uniéndose al canal: %s (ID=%d)", ch.Name, ch.ID)
if err := bot.ts3.JoinChannel(ch.ID); err != nil {
log.Printf("[Room] Error al unirse al canal: %v", err)
} else {
log.Printf("[Room] ✓ Unido al canal: %s", ch.Name)
}
} else {
log.Printf("[Room] ⚠ Canal no encontrado: %s", *room)
}
}()
}
})
bot.ts3.On(ts3client.EventClientEnter, func(e *ts3client.ClientEnterEvent) {
log.Printf("→ Usuario entró: %s (ID=%d)", e.Nickname, e.ClientID)
// Don't create session for ourselves (compare by ID, not nickname)
// Don't track ourselves
if e.ClientID == bot.selfID {
log.Printf(" (Soy yo, ignorando)")
return
}
// Create xAI session for this user
go bot.createSession(e.ClientID, e.Nickname, bot.greeting)
log.Printf("→ Usuario entró: %s (ID=%d)", e.Nickname, e.ClientID)
bot.usersMu.Lock()
bot.users[e.ClientID] = &UserInfo{
ClientID: e.ClientID,
Nickname: e.Nickname,
}
bot.usersMu.Unlock()
// Notify xAI about new user (if past startup grace period)
if bot.greeting != "" && time.Since(bot.startTime) > 3*time.Second {
bot.globalMu.Lock()
if bot.globalXAI != nil && bot.globalXAI.IsConnected() {
msg := fmt.Sprintf("%s. El usuario %s acaba de unirse al canal.", bot.greeting, e.Nickname)
if err := bot.globalXAI.SendText(msg); err != nil {
log.Printf("[Global] Error enviando notificación: %v", err)
}
}
bot.globalMu.Unlock()
}
})
bot.ts3.On(ts3client.EventClientLeft, func(e *ts3client.ClientLeftEvent) {
log.Printf("← Usuario salió: ID=%d (%s)", e.ClientID, e.Reason)
// Close xAI session for this user
bot.closeSession(e.ClientID)
bot.usersMu.Lock()
if user, ok := bot.users[e.ClientID]; ok {
log.Printf("← Usuario salió: %s (ID=%d, %s)", user.Nickname, e.ClientID, e.Reason)
delete(bot.users, e.ClientID)
}
bot.usersMu.Unlock()
})
// Audio handler: Mix ALL incoming audio into unified buffer
bot.ts3.On(ts3client.EventAudio, func(e *ts3client.AudioEvent) {
// Forward audio from TeamSpeak to all xAI sessions
// Forward audio ONLY to the sender's session
bot.sessionsMu.RLock()
if session, ok := bot.sessions[e.SenderID]; ok {
if session.XAI != nil && session.XAI.IsConnected() {
session.XAI.SendAudio(e.PCM)
}
}
bot.sessionsMu.RUnlock()
bot.handleInputAudio(e.SenderID, e.PCM)
})
bot.ts3.On(ts3client.EventError, func(e *ts3client.ErrorEvent) {
@@ -125,222 +178,176 @@ func main() {
<-sigChan
log.Println("Cerrando...")
// Close all xAI sessions and audio senders first
bot.sessionsMu.Lock()
for _, session := range bot.sessions {
// Close audio sender first
select {
case <-session.done:
// Already closed
default:
close(session.done)
}
// Then close xAI
if session.XAI != nil {
session.XAI.Close()
}
}
bot.sessionsMu.Unlock()
// Signal all goroutines to stop
close(bot.done)
// Wait for audio senders to stop
// Close global xAI session
bot.globalMu.Lock()
if bot.globalXAI != nil {
bot.globalXAI.Close()
}
bot.globalMu.Unlock()
// Wait for goroutines
time.Sleep(200 * time.Millisecond)
// Now disconnect from TeamSpeak
// Disconnect from TeamSpeak
bot.ts3.Disconnect()
// Signal main that we are done
close(shutdownDone)
}()
// Start global audio mixer
go bot.runAudioMixer(shutdownDone)
// Start input sender (sends mixed audio to xAI)
go bot.runInputSender()
// Start output mixer (sends xAI audio to TeamSpeak)
go bot.runOutputSender()
// Connect to TeamSpeak
if err := bot.ts3.Connect(); err != nil {
// If connect returns error, check if it's because we're shutting down
select {
case <-shutdownDone:
// Normal shutdown
log.Println("Conexión cerrada por shutdown")
default:
log.Fatalf("Error de conexión: %v", err)
}
}
// Wait for shutdown to complete if we returned from Connect cleanly
log.Println("Esperando confirmación final de shutdown...")
<-shutdownDone
log.Println("Shutdown completado. Saliendo.")
os.Exit(0)
}
// createSession creates a new xAI voice session for a user
func (b *Bot) createSession(clientID uint16, nickname string, greeting string) {
log.Printf("[Session] Creando sesión xAI para %s...", nickname)
// Create session with audio queue
session := &VoiceSession{
ClientID: clientID,
Nickname: nickname,
AudioBuffer: make([]int16, 0, 960*10),
AudioQueue: make(chan []int16, 500), // Buffer up to 500 frames (~10 sec)
done: make(chan struct{}),
}
// Start audio sender goroutine with proper 20ms timing
// Global audio mixer handles sending
// go b.audioSender(session)
// initGlobalSession creates the single xAI session for all users
func (b *Bot) initGlobalSession() error {
log.Println("[Global] Iniciando sesión xAI global...")
xaiClient := xai.New(b.apiKey)
// Set up audio callback - buffer and queue in 960-sample chunks
// Handle output audio from xAI → buffer for TeamSpeak
xaiClient.OnAudio(func(pcm []int16) {
b.sessionsMu.Lock()
session.AudioBuffer = append(session.AudioBuffer, pcm...)
b.outputMu.Lock()
b.outputBuffer = append(b.outputBuffer, pcm...)
// Queue complete 960-sample frames
for len(session.AudioBuffer) >= 960 {
for len(b.outputBuffer) >= 960 {
frame := make([]int16, 960)
copy(frame, session.AudioBuffer[:960])
session.AudioBuffer = session.AudioBuffer[960:]
copy(frame, b.outputBuffer[:960])
b.outputBuffer = b.outputBuffer[960:]
// Non-blocking send to queue
select {
case session.AudioQueue <- frame:
case b.outputQueue <- frame:
default:
// Queue full, drop frame
// Queue full, drop oldest
}
}
b.sessionsMu.Unlock()
b.outputMu.Unlock()
})
// Set up transcript callback for logging
// Log transcripts
xaiClient.OnTranscript(func(text string) {
log.Printf("[Grok] %s", text)
})
// Clear audio queue when user starts speaking (interruption)
xaiClient.OnSpeechStarted(func() {
// Disable queue clearing for now to prevent cutting off greetings due to sensitive VAD
/*
b.sessionsMu.Lock()
// Clear the buffer
session.AudioBuffer = session.AudioBuffer[:0]
// Drain the queue
for len(session.AudioQueue) > 0 {
<-session.AudioQueue
}
b.sessionsMu.Unlock()
*/
log.Printf("[Session] Speech started by %s (VAD) - Ignoring interruption to ensure playback", nickname)
})
// Connect to xAI
if err := xaiClient.Connect(); err != nil {
log.Printf("[Session] Error conectando a xAI: %v", err)
close(session.done)
return
return fmt.Errorf("connect: %w", err)
}
// Configure the session
if err := xaiClient.ConfigureSession(b.voice, b.prompt); err != nil {
log.Printf("[Session] Error configurando sesión: %v", err)
xaiClient.Close()
close(session.done)
return
return fmt.Errorf("configure: %w", err)
}
// Store the xAI client in session
session.XAI = xaiClient
b.globalMu.Lock()
b.globalXAI = xaiClient
b.globalMu.Unlock()
b.sessionsMu.Lock()
b.sessions[clientID] = session
b.sessionsMu.Unlock()
log.Printf("[Session] ✓ Sesión xAI activa para %s", nickname)
// Send greeting to start conversation (if configured)
// Send greeting to start conversation (if configured)
if strings.TrimSpace(greeting) != "" {
// Only greet if we are past the startup grace period (3 seconds)
// This prevents "Greeting Storm" when joining a channel with existing users
if time.Since(b.startTime) > 3*time.Second {
go func() {
time.Sleep(500 * time.Millisecond) // Small delay for session to stabilize
if err := xaiClient.SendText(greeting); err != nil {
log.Printf("[Session] Error enviando saludo: %v", err)
}
}()
} else {
log.Printf("[Session] Omitiendo saludo inicial para %s (sesión existente detectada en arranque)", nickname)
}
}
log.Println("[Global] ✓ Sesión xAI global activa")
return nil
}
// runAudioMixer mixes audio from all active sessions and sends it to TeamSpeak
func (b *Bot) runAudioMixer(stop <-chan struct{}) {
// handleInputAudio mixes incoming audio from any user into the unified buffer
func (b *Bot) handleInputAudio(senderID uint16, pcm []int16) {
b.inputMu.Lock()
defer b.inputMu.Unlock()
// Extend buffer if needed
neededLen := len(pcm)
currentLen := len(b.inputBuffer)
if currentLen < neededLen {
// Extend with zeros
b.inputBuffer = append(b.inputBuffer, make([]int16, neededLen-currentLen)...)
}
// Mix (add with clipping protection)
for i, sample := range pcm {
val := int32(b.inputBuffer[i]) + int32(sample)
if val > 32767 {
val = 32767
}
if val < -32768 {
val = -32768
}
b.inputBuffer[i] = int16(val)
}
b.lastInputTime = time.Now()
}
// runInputSender sends buffered audio to xAI every 20ms
func (b *Bot) runInputSender() {
ticker := time.NewTicker(20 * time.Millisecond)
defer ticker.Stop()
mixedFrame := make([]int16, 960)
for {
select {
case <-b.done:
return
case <-ticker.C:
b.inputMu.Lock()
if len(b.inputBuffer) >= 960 {
// Extract one frame
frame := make([]int16, 960)
copy(frame, b.inputBuffer[:960])
// Shift buffer (remove consumed samples)
b.inputBuffer = b.inputBuffer[960:]
b.inputMu.Unlock()
// Send to global xAI session
b.globalMu.Lock()
if b.globalXAI != nil && b.globalXAI.IsConnected() {
b.globalXAI.SendAudio(frame)
}
b.globalMu.Unlock()
} else {
b.inputMu.Unlock()
}
}
}
}
// runOutputSender sends xAI audio responses to TeamSpeak with proper timing
func (b *Bot) runOutputSender() {
ticker := time.NewTicker(20 * time.Millisecond)
defer ticker.Stop()
for {
select {
case <-stop:
case <-b.done:
return
case <-ticker.C:
hasAudio := false
// Zero output buffer
for i := range mixedFrame {
mixedFrame[i] = 0
}
b.sessionsMu.RLock()
for _, session := range b.sessions {
// Try to get a frame from the queue
select {
case frame := <-session.AudioQueue:
hasAudio = true
// Mix (Sum and Clamp)
for i := 0; i < 960; i++ {
if i >= len(frame) {
break
}
val := int32(mixedFrame[i]) + int32(frame[i])
if val > 32767 {
val = 32767
}
if val < -32768 {
val = -32768
}
mixedFrame[i] = int16(val)
}
default:
// No audio from this session
}
}
b.sessionsMu.RUnlock()
if hasAudio {
if err := b.ts3.SendAudio(mixedFrame); err != nil {
log.Printf("[Mixer] Error sending audio: %v", err)
select {
case frame := <-b.outputQueue:
if err := b.ts3.SendAudio(frame); err != nil {
log.Printf("[Output] Error sending audio: %v", err)
}
default:
// No audio to send
}
}
}
}
// closeSession closes an xAI session for a user
func (b *Bot) closeSession(clientID uint16) {
b.sessionsMu.Lock()
defer b.sessionsMu.Unlock()
if session, ok := b.sessions[clientID]; ok {
log.Printf("[Session] Cerrando sesión xAI para %s", session.Nickname)
if session.XAI != nil {
session.XAI.Close()
}
delete(b.sessions, clientID)
}
}