/**
 * Small HTTP server that turns text into speech with Microsoft Edge TTS.
 *
 * Routes:
 *   GET  /health  -> { ok, voice }
 *   GET  /voices  -> list of English ("en-*") voices
 *   POST /tts     -> { audioUrl } for the synthesized (and cached) MP3
 *   GET  /audio/* -> static files served from the on-disk cache
 *
 * Configuration is read from the TTS_PORT, TTS_HOST, TTS_VOICE, TTS_RATE and
 * TTS_PITCH environment variables.
 */
import crypto from 'node:crypto';
import fs from 'node:fs';
import path from 'node:path';

import cors from 'cors';
import express from 'express';
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';

const app = express();
const port = Number(process.env.TTS_PORT ?? 3333);
const host = process.env.TTS_HOST ?? '0.0.0.0';
const cacheDir = path.resolve(process.cwd(), '.tts-cache');

// Characters that would let a value break out of an XML attribute or element.
// Deliberately not a /g regex so that .test() stays stateless.
const XML_MARKUP_CHARS = /[<>&"']/;

fs.mkdirSync(cacheDir, { recursive: true });

app.use(cors());
app.use(express.json({ limit: '64kb' }));
app.use('/audio', express.static(cacheDir, { maxAge: '1h' }));

app.get('/health', (_request, response) => {
  response.json({ ok: true, voice: getVoiceFromEnv() });
});

app.get('/voices', async (_request, response) => {
  try {
    const tts = new MsEdgeTTS({});
    const voices = await tts.getVoices();
    response.json(
      voices
        .filter((voice) => voice.Locale.startsWith('en-'))
        .map((voice) => ({
          name: voice.ShortName,
          friendlyName: voice.FriendlyName,
          gender: voice.Gender,
          locale: voice.Locale,
        })),
    );
  } catch (error) {
    // Without this, a rejected promise would leave the request hanging.
    console.error('Failed to list voices:', error);
    response.status(500).json({ error: 'Failed to list voices' });
  }
});

app.post('/tts', async (request, response) => {
  const text = sanitizeText(String(request.body?.text ?? ''));
  if (!text) {
    response.status(400).json({ error: 'Missing text' });
    return;
  }

  const voice = String(request.body?.voice ?? getVoiceFromEnv());
  const rate = String(request.body?.rate ?? process.env.TTS_RATE ?? '-8%');
  const pitch = String(request.body?.pitch ?? process.env.TTS_PITCH ?? '+0Hz');

  // NOTE(review): these values are handed to msedge-tts unescaped; presumably
  // the library interpolates them into SSML attributes - confirm. No valid
  // voice name, rate or pitch contains XML markup, so reject such input.
  if ([voice, rate, pitch].some((value) => XML_MARKUP_CHARS.test(value))) {
    response.status(400).json({ error: 'Invalid voice, rate or pitch' });
    return;
  }

  // The cache key covers every input that influences the generated audio.
  const hash = crypto.createHash('sha256').update(`${voice}|${rate}|${pitch}|${text}`).digest('hex');
  const filename = `${hash}.mp3`;
  const filePath = path.join(cacheDir, filename);

  try {
    if (!fs.existsSync(filePath)) {
      await synthesizeToFile(text, { voice, rate, pitch }, filePath);
    }
  } catch (error) {
    console.error('TTS synthesis failed:', error);
    response.status(500).json({ error: 'TTS synthesis failed' });
    return;
  }

  response.json({ audioUrl: `${request.protocol}://${request.get('host')}/audio/${filename}` });
});

app.listen(port, host, () => {
  console.log(`TTS server listening on http://${host}:${port}`);
  console.log(`Voice: ${getVoiceFromEnv()}`);
});

/**
 * Synthesizes `text` and moves the resulting MP3 to `filePath`.
 *
 * @param {string} text - Already sanitized plain text (XML-escaped here).
 * @param {{ voice: string, rate: string, pitch: string }} options - Voice settings.
 * @param {string} filePath - Final location of the audio file inside the cache.
 * @returns {Promise<void>}
 * @throws Propagates any error from the TTS client or the file system.
 */
async function synthesizeToFile(text, { voice, rate, pitch }, filePath) {
  const tts = new MsEdgeTTS({});
  try {
    await tts.setMetadata(voice, OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3);
    const { audioFilePath, metadataFilePath } = await tts.toFile(cacheDir, escapeXml(text), { rate, pitch });
    fs.renameSync(audioFilePath, filePath);
    if (metadataFilePath && fs.existsSync(metadataFilePath)) {
      fs.unlinkSync(metadataFilePath);
    }
  } finally {
    // Always release the client, even when synthesis fails. Closing is
    // best-effort: a failure here must not mask the original error.
    try {
      tts.close();
    } catch (closeError) {
      console.warn('Failed to close TTS client:', closeError);
    }
  }
}

/**
 * Returns the default voice name: TTS_VOICE if set, else 'en-US-JennyNeural'.
 *
 * @returns {string}
 */
function getVoiceFromEnv() {
  return process.env.TTS_VOICE ?? 'en-US-JennyNeural';
}

/**
 * Strips markup that should not be read aloud and bounds the text length.
 *
 * @param {string} value - Raw text from the request body.
 * @returns {string} Cleaned text, at most 1800 UTF-16 code units long.
 */
function sanitizeText(value) {
  return value
    // Drop everything up to and including each closing </think> tag.
    .replace(/[\s\S]*?<\/think>/gi, '')
    // Fenced code blocks become a single space.
    .replace(/```[\s\S]*?```/g, ' ')
    // Inline code and bold keep their inner text.
    .replace(/`([^`]+)`/g, '$1')
    .replace(/\*\*([^*]+)\*\*/g, '$1')
    // Remaining Markdown punctuation.
    .replace(/[*_#>~]/g, '')
    // URLs are removed entirely.
    .replace(/https?:\/\/\S+/g, '')
    // Collapse whitespace runs, then trim and truncate.
    .replace(/\s+/g, ' ')
    .trim()
    .slice(0, 1800);
}

/**
 * Escapes the five XML special characters so text can be embedded in markup.
 *
 * @param {string} value - Plain text.
 * @returns {string} Text with &, <, >, " and ' replaced by XML entities.
 */
function escapeXml(value) {
  return value
    // Ampersand must go first so the entities added below are not re-escaped.
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');
}