Monorepo for Aesthetic.Computer (aesthetic.computer)
4
fork

Configure Feed

Select the types of activity you want to include in your feed.

Add gpt-4o-mini-tts support with instructions for emotional TTS (scream mode)

Upgrades the say API to use OpenAI's gpt-4o-mini-tts model when instructions
are provided, enabling emotional/style control. Adds say:scream colon option
for blood-curdling screams. Falls back to tts-1 when no instructions given.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

+49 -26
+2
output/.gitignore
··· 1 + * 2 + !.gitignore
+33 -22
system/netlify/functions/say.js
··· 6 6 const crypto = require("crypto"); 7 7 const { S3Client, HeadObjectCommand, PutObjectCommand } = require("@aws-sdk/client-s3"); 8 8 9 - // OpenAI voice mapping: alloy, echo, fable, onyx, nova, shimmer 9 + // OpenAI voice mapping 10 + // gpt-4o-mini-tts voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse 10 11 const OPENAI_VOICES = { 11 - male: ["onyx", "echo", "alloy"], 12 - female: ["nova", "shimmer", "fable"], 13 - neutral: ["alloy", "fable", "echo"], 12 + male: ["onyx", "echo", "ash", "alloy"], 13 + female: ["nova", "shimmer", "fable", "coral"], 14 + neutral: ["alloy", "fable", "echo", "sage", "verse", "ballad"], 14 15 }; 15 16 16 17 // Initialize S3 client for Digital Ocean Spaces ··· 27 28 const CDN_URL = "https://art.aesthetic.computer"; 28 29 const CACHE_PREFIX = "tts-cache/"; 29 30 30 - // Generate cache key from provider + voice + text 31 - function getCacheKey(provider, voiceId, text) { 32 - const hash = crypto.createHash("sha256").update(`${provider}:${voiceId}:${text}`).digest("hex"); 31 + // Generate cache key from provider + voice + text + instructions 32 + function getCacheKey(provider, voiceId, text, instructions) { 33 + const parts = `${provider}:${voiceId}:${text}${instructions ? `:${instructions}` : ""}`; 34 + const hash = crypto.createHash("sha256").update(parts).digest("hex"); 33 35 return `${CACHE_PREFIX}${hash}.mp3`; 34 36 } 35 37 ··· 67 69 } 68 70 69 71 // Generate audio with OpenAI TTS 70 - async function generateOpenAI(text, gender, set) { 72 + // Uses gpt-4o-mini-tts when instructions are provided (supports emotional/style control), 73 + // falls back to tts-1 otherwise. 
74 + async function generateOpenAI(text, gender, set, instructions) { 71 75 const voiceList = OPENAI_VOICES[gender] || OPENAI_VOICES.neutral; 72 76 const voice = voiceList[set % voiceList.length]; 73 - 77 + 74 78 const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); 75 - 76 - const mp3Response = await openai.audio.speech.create({ 77 - model: "tts-1", 79 + 80 + const params = { 81 + model: instructions ? "gpt-4o-mini-tts" : "tts-1", 78 82 voice: voice, 79 83 input: text, 80 84 response_format: "mp3", 81 - }); 85 + }; 86 + 87 + if (instructions) params.instructions = instructions; 88 + 89 + const mp3Response = await openai.audio.speech.create(params); 82 90 83 91 return { 84 92 buffer: Buffer.from(await mp3Response.arrayBuffer()), ··· 154 162 const utterance = body.from || "aesthetic.computer"; 155 163 const set = parseInt(body.voice?.split(":")[1]) || 0; 156 164 const gender = body.voice?.split(":")[0]?.toLowerCase() || "neutral"; 157 - 165 + 158 166 // Provider: "openai" (default), "google" 159 167 // Can be set via body.provider or defaults to openai 160 168 const provider = body.provider || "openai"; 161 - 169 + 170 + // Instructions for gpt-4o-mini-tts emotional/style control (OpenAI only) 171 + const instructions = provider === "openai" ? 
(body.instructions || null) : null; 172 + 162 173 // Cache bust: if true, skip cache lookup and regenerate 163 174 const bustCache = body.bust === true; 164 175 165 176 // Check for SSML (only Google supports it) 166 177 const isSSML = utterance.indexOf("<speak>") !== -1; 167 - 178 + 168 179 // Strip SSML tags for OpenAI (it doesn't support them) 169 180 let text = utterance; 170 181 if (isSSML && provider === "openai") { ··· 173 184 174 185 // Build voice identifier for cache key 175 186 const voiceSpec = `${provider}-${gender}-${set}`; 176 - const cacheKey = getCacheKey(provider, voiceSpec, text); 187 + const cacheKey = getCacheKey(provider, voiceSpec, text, instructions); 177 188 178 189 try { 179 190 // Check cache first - return redirect to CDN if cached (unless bust=true) 180 191 if (!bustCache) { 181 192 const cachedUrl = await checkCache(cacheKey); 182 - 193 + 183 194 if (cachedUrl) { 184 195 console.log(`🎯 TTS cache hit: ${cachedUrl}`); 185 196 return { ··· 197 208 } 198 209 199 210 // Cache miss (or bust) - generate with selected provider 200 - console.log(`🔄 TTS ${bustCache ? 'regenerating' : 'cache miss'} (${provider}): ${text.substring(0, 50)}...`); 201 - 211 + console.log(`🔄 TTS ${bustCache ? "regenerating" : "cache miss"} (${provider}): ${text.substring(0, 50)}...`); 212 + 202 213 let result; 203 214 if (provider === "google") { 204 215 result = await generateGoogle(text, gender, set, isSSML); 205 216 } else { 206 - result = await generateOpenAI(text, gender, set); 217 + result = await generateOpenAI(text, gender, set, instructions); 207 218 } 208 219 209 220 const { buffer: audioBuffer, voiceId } = result; ··· 220 231 221 232 // Cache for next time 222 233 const cdnUrl = await saveToCache(cacheKey, audioBuffer); 223 - 234 + 224 235 if (cdnUrl) { 225 236 return { 226 237 statusCode: 302,
+10 -3
system/public/aesthetic.computer/disks/say.mjs
··· 16 16 let status = "idle"; // idle, speaking, error 17 17 let provider = "openai"; // "openai" or "google" 18 18 let gender = "neutral"; 19 + let instructions = null; 19 20 20 21 // 🥾 Boot 21 22 function boot({ params, colon }) { ··· 33 34 else if (part === "openai") provider = "openai"; 34 35 else if (part === "male") gender = "male"; 35 36 else if (part === "female") gender = "female"; 37 + else if (part === "scream") { 38 + provider = "openai"; 39 + instructions = "Deliver this as a blood-curdling scream. Shriek at the absolute top of your lungs with your voice cracking. Pure primal rage. Do NOT speak normally — only scream, raw and unhinged."; 40 + } 36 41 } 37 42 console.log(`Provider: ${provider}, Gender: ${gender}`); 38 43 } ··· 45 50 // Note: Top-left corner is reserved for prompt HUD label 46 51 47 52 // Provider indicator (below HUD area) 48 - const providerColor = provider === "google" ? "cyan" : "lime"; 49 - ink(providerColor).write(`[${provider}]`, { x: 6, y: 18 }); 53 + const providerColor = instructions ? "red" : provider === "google" ? "cyan" : "lime"; 54 + const providerLabel = instructions ? `[${provider} SCREAM]` : `[${provider}]`; 55 + ink(providerColor).write(providerLabel, { x: 6, y: 18 }); 50 56 51 57 // Instructions 52 58 ink("gray").write("say <words>", { x: 6, y: 32 }); ··· 80 86 81 87 const voice = `${gender}:0`; 82 88 83 - console.log(`🗣️ Speaking: "${text}" with ${provider}, voice: ${voice}`); 89 + console.log(`🗣️ Speaking: "${text}" with ${provider}, voice: ${voice}${instructions ? " [SCREAM]" : ""}`); 84 90 speak(text, voice, "cloud", { 85 91 volume: 1, 86 92 provider: provider, 93 + instructions, 87 94 }); 88 95 } 89 96
+4 -1
system/public/aesthetic.computer/lib/speech.mjs
··· 68 68 69 69 synth.speak(utterance); 70 70 } else if (mode === "cloud") { 71 - const label = `speech:${voice}:${opts.provider || "openai"} - ${words}`; 71 + const instTag = opts.instructions ? `:${opts.instructions.slice(0, 32)}` : ""; 72 + const label = `speech:${voice}:${opts.provider || "openai"}${instTag} - ${words}`; 72 73 73 74 // For preload-only mode, return a promise that resolves when cached 74 75 let preloadResolve = null; ··· 198 199 provider: opts.provider || "openai", // "openai" (default) or "google" 199 200 bust: needsBust, // Force regenerate on server if marked 200 201 }; 202 + 203 + if (opts.instructions) payload.instructions = opts.instructions; 201 204 202 205 // Create a promise that resolves when the fetch completes 203 206 let fetchResolve;