Use laptop MP3 TTS playback
This commit is contained in:
@@ -1,7 +1,6 @@
|
|||||||
EXPO_PUBLIC_OLLAMA_BASE_URL=http://localhost:11434
|
EXPO_PUBLIC_OLLAMA_BASE_URL=http://localhost:11434
|
||||||
EXPO_PUBLIC_OLLAMA_MODEL=llama3.2
|
EXPO_PUBLIC_OLLAMA_MODEL=llama3.2
|
||||||
EXPO_PUBLIC_SPEECH_LANGUAGE=en-US
|
EXPO_PUBLIC_TTS_BASE_URL=http://localhost:3333
|
||||||
EXPO_PUBLIC_SPEECH_RATE=0.85
|
EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
|
||||||
EXPO_PUBLIC_SPEECH_PITCH=1
|
EXPO_PUBLIC_TTS_RATE=0.88
|
||||||
# Optional iOS voice identifier. Leave empty to use the first matching iPhone voice.
|
EXPO_PUBLIC_TTS_PITCH=+0Hz
|
||||||
# EXPO_PUBLIC_SPEECH_VOICE=com.apple.voice.compact.en-US.Samantha
|
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,6 +8,7 @@ node_modules/
|
|||||||
dist/
|
dist/
|
||||||
web-build/
|
web-build/
|
||||||
expo-env.d.ts
|
expo-env.d.ts
|
||||||
|
.tts-cache/
|
||||||
|
|
||||||
# Native
|
# Native
|
||||||
.kotlin/
|
.kotlin/
|
||||||
|
|||||||
34
README.md
34
README.md
@@ -109,7 +109,39 @@ npm run start
|
|||||||
|
|
||||||
## Speech Input And Playback
|
## Speech Input And Playback
|
||||||
|
|
||||||
Playback works directly in Expo Go. AI replies are read aloud through the iPhone speaker. The chat also has buttons for `Read last answer` and `Stop`.
|
Playback uses a local MP3 TTS server on the laptop. AI replies are sent to the laptop, converted to an MP3 with a Microsoft neural English voice, and then played on the iPhone. This avoids the robotic iPhone system voice.
|
||||||
|
|
||||||
|
Start the TTS server in a second terminal:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run tts:start
|
||||||
|
```
|
||||||
|
|
||||||
|
For Expo Go on iPhone, `.env` must point to the laptop IP:
|
||||||
|
|
||||||
|
```text
|
||||||
|
EXPO_PUBLIC_TTS_BASE_URL=http://192.168.10.33:3333
|
||||||
|
EXPO_PUBLIC_TTS_VOICE=en-US-JennyNeural
|
||||||
|
EXPO_PUBLIC_TTS_RATE=0.88
|
||||||
|
EXPO_PUBLIC_TTS_PITCH=+0Hz
|
||||||
|
```
|
||||||
|
|
||||||
|
Useful English voices:
|
||||||
|
|
||||||
|
- `en-US-JennyNeural`
|
||||||
|
- `en-US-AvaNeural`
|
||||||
|
- `en-US-EmmaNeural`
|
||||||
|
- `en-US-GuyNeural`
|
||||||
|
- `en-GB-SoniaNeural`
|
||||||
|
- `en-GB-RyanNeural`
|
||||||
|
|
||||||
|
You can list available voices while the TTS server is running:
|
||||||
|
|
||||||
|
```text
|
||||||
|
http://192.168.10.33:3333/voices
|
||||||
|
```
|
||||||
|
|
||||||
|
The chat has buttons for `Read last answer` and `Stop`.
|
||||||
|
|
||||||
Speech input currently works through the iPhone keyboard:
|
Speech input currently works through the iPhone keyboard:
|
||||||
|
|
||||||
|
|||||||
935
package-lock.json
generated
935
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -7,14 +7,18 @@
|
|||||||
"android": "expo start --android",
|
"android": "expo start --android",
|
||||||
"ios": "expo start --ios",
|
"ios": "expo start --ios",
|
||||||
"web": "expo start --web",
|
"web": "expo start --web",
|
||||||
"prebuild:ios": "expo prebuild --platform ios"
|
"prebuild:ios": "expo prebuild --platform ios",
|
||||||
|
"tts:start": "node tools/tts-server.mjs"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@react-navigation/native": "^7.2.2",
|
"@react-navigation/native": "^7.2.2",
|
||||||
"@react-navigation/native-stack": "^7.14.12",
|
"@react-navigation/native-stack": "^7.14.12",
|
||||||
|
"cors": "^2.8.6",
|
||||||
"expo": "~54.0.33",
|
"expo": "~54.0.33",
|
||||||
"expo-speech": "~14.0.8",
|
"expo-av": "~16.0.8",
|
||||||
"expo-status-bar": "~3.0.9",
|
"expo-status-bar": "~3.0.9",
|
||||||
|
"express": "^5.2.1",
|
||||||
|
"msedge-tts": "^2.0.5",
|
||||||
"react": "19.1.0",
|
"react": "19.1.0",
|
||||||
"react-native": "0.81.5",
|
"react-native": "0.81.5",
|
||||||
"react-native-safe-area-context": "~5.6.0",
|
"react-native-safe-area-context": "~5.6.0",
|
||||||
|
|||||||
@@ -1,6 +0,0 @@
|
|||||||
export const speechConfig = {
|
|
||||||
language: process.env.EXPO_PUBLIC_SPEECH_LANGUAGE ?? 'en-US',
|
|
||||||
preferredVoice: process.env.EXPO_PUBLIC_SPEECH_VOICE,
|
|
||||||
pitch: Number(process.env.EXPO_PUBLIC_SPEECH_PITCH ?? 1),
|
|
||||||
rate: Number(process.env.EXPO_PUBLIC_SPEECH_RATE ?? 0.85),
|
|
||||||
};
|
|
||||||
6
src/config/tts.ts
Normal file
6
src/config/tts.ts
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
/**
 * Cleans up a configured base URL so that callers can safely append a path
 * such as `/tts` to it.
 *
 * Surrounding whitespace and any trailing slashes are removed; a missing
 * value becomes the empty string.
 */
function normalizeBaseUrl(raw: string | undefined): string {
  return (raw ?? '').trim().replace(/\/+$/, '');
}

/**
 * Client-side settings for the text-to-speech server, read from
 * `EXPO_PUBLIC_TTS_*` environment variables.
 *
 * - `baseUrl`: origin of the TTS server, without a trailing slash; empty when unset.
 * - `voice`: voice name sent with each request (default `en-US-JennyNeural`).
 * - `rate`: speaking-rate value sent with each request (default `-8%`).
 * - `pitch`: pitch value sent with each request (default `+0Hz`).
 */
export const ttsConfig = {
  // Each variable is read via a literal `process.env.NAME` expression on purpose.
  // NOTE(review): Expo is understood to inline EXPO_PUBLIC_* only for literal
  // member accesses — confirm before refactoring these reads.
  baseUrl: normalizeBaseUrl(process.env.EXPO_PUBLIC_TTS_BASE_URL),
  voice: process.env.EXPO_PUBLIC_TTS_VOICE ?? 'en-US-JennyNeural',
  rate: process.env.EXPO_PUBLIC_TTS_RATE ?? '-8%',
  pitch: process.env.EXPO_PUBLIC_TTS_PITCH ?? '+0Hz',
};
|
||||||
@@ -1,43 +1,62 @@
|
|||||||
import * as Speech from 'expo-speech';
|
import { Audio, type AVPlaybackSource } from 'expo-av';
|
||||||
|
|
||||||
import { speechConfig } from '../config/speech';
|
import { ttsConfig } from '../config/tts';
|
||||||
|
|
||||||
let cachedVoice: string | undefined;
|
let currentSound: Audio.Sound | undefined;
|
||||||
|
|
||||||
export async function speakText(text: string) {
|
export async function speakText(text: string) {
|
||||||
Speech.stop();
|
await stopSpeaking();
|
||||||
|
|
||||||
const voice = await getPreferredVoice();
|
const cleanText = stripThinkingBlocks(text);
|
||||||
|
if (!cleanText || !ttsConfig.baseUrl) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
Speech.speak(stripThinkingBlocks(text), {
|
const response = await fetch(`${ttsConfig.baseUrl}/tts`, {
|
||||||
language: speechConfig.language,
|
method: 'POST',
|
||||||
pitch: speechConfig.pitch,
|
headers: {
|
||||||
rate: speechConfig.rate,
|
'Content-Type': 'application/json',
|
||||||
voice,
|
},
|
||||||
|
body: JSON.stringify({
|
||||||
|
text: cleanText,
|
||||||
|
voice: ttsConfig.voice,
|
||||||
|
rate: ttsConfig.rate,
|
||||||
|
pitch: ttsConfig.pitch,
|
||||||
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
throw new Error(`TTS request failed with ${response.status}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const data = (await response.json()) as { audioUrl?: string };
|
||||||
|
if (!data.audioUrl) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const source: AVPlaybackSource = { uri: data.audioUrl };
|
||||||
|
const { sound } = await Audio.Sound.createAsync(source, { shouldPlay: true });
|
||||||
|
currentSound = sound;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function stopSpeaking() {
|
export async function stopSpeaking() {
|
||||||
Speech.stop();
|
if (!currentSound) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
await currentSound.stopAsync();
|
||||||
|
await currentSound.unloadAsync();
|
||||||
|
currentSound = undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
function stripThinkingBlocks(text: string) {
|
function stripThinkingBlocks(text: string) {
|
||||||
return text.replace(/<think>[\s\S]*?<\/think>/gi, '').trim();
|
return text
|
||||||
}
|
.replace(/<think>[\s\S]*?<\/think>/gi, '')
|
||||||
|
.replace(/```[\s\S]*?```/g, ' ')
|
||||||
async function getPreferredVoice() {
|
.replace(/`([^`]+)`/g, '$1')
|
||||||
if (cachedVoice) {
|
.replace(/\*\*([^*]+)\*\*/g, '$1')
|
||||||
return cachedVoice;
|
.replace(/[*_#>~]/g, '')
|
||||||
}
|
.replace(/https?:\/\/\S+/g, '')
|
||||||
|
.replace(/\s+/g, ' ')
|
||||||
if (speechConfig.preferredVoice) {
|
.trim();
|
||||||
cachedVoice = speechConfig.preferredVoice;
|
|
||||||
return cachedVoice;
|
|
||||||
}
|
|
||||||
|
|
||||||
const voices = await Speech.getAvailableVoicesAsync();
|
|
||||||
const matchingVoice = voices.find((voice) => voice.language === speechConfig.language);
|
|
||||||
cachedVoice = matchingVoice?.identifier;
|
|
||||||
|
|
||||||
return cachedVoice;
|
|
||||||
}
|
}
|
||||||
|
|||||||
96
tools/tts-server.mjs
Normal file
96
tools/tts-server.mjs
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
import crypto from 'node:crypto';
|
||||||
|
import fs from 'node:fs';
|
||||||
|
import path from 'node:path';
|
||||||
|
import cors from 'cors';
|
||||||
|
import express from 'express';
|
||||||
|
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
|
||||||
|
|
||||||
|
const app = express();
|
||||||
|
const port = Number(process.env.TTS_PORT ?? 3333);
|
||||||
|
const host = process.env.TTS_HOST ?? '0.0.0.0';
|
||||||
|
const cacheDir = path.resolve(process.cwd(), '.tts-cache');
|
||||||
|
|
||||||
|
fs.mkdirSync(cacheDir, { recursive: true });
|
||||||
|
|
||||||
|
app.use(cors());
|
||||||
|
app.use(express.json({ limit: '64kb' }));
|
||||||
|
app.use('/audio', express.static(cacheDir, { maxAge: '1h' }));
|
||||||
|
|
||||||
|
app.get('/health', (_request, response) => {
|
||||||
|
response.json({ ok: true, voice: getVoiceFromEnv() });
|
||||||
|
});
|
||||||
|
|
||||||
|
app.get('/voices', async (_request, response) => {
|
||||||
|
const tts = new MsEdgeTTS({});
|
||||||
|
const voices = await tts.getVoices();
|
||||||
|
response.json(
|
||||||
|
voices
|
||||||
|
.filter((voice) => voice.Locale.startsWith('en-'))
|
||||||
|
.map((voice) => ({
|
||||||
|
name: voice.ShortName,
|
||||||
|
friendlyName: voice.FriendlyName,
|
||||||
|
gender: voice.Gender,
|
||||||
|
locale: voice.Locale,
|
||||||
|
})),
|
||||||
|
);
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
 * POST /tts — turns the posted text into an MP3 file and returns its URL.
 *
 * Request body (JSON): `text` (required), plus optional `voice`, `rate` and
 * `pitch`; missing options fall back to TTS_VOICE / TTS_RATE / TTS_PITCH or
 * the built-in defaults.
 *
 * Responses:
 * - 400 `{ error: 'Missing text' }` when the sanitized text is empty.
 * - 200 `{ audioUrl }` pointing at the file served under `/audio`.
 *
 * Files are named after a SHA-256 of voice, rate, pitch and text, so a
 * repeated request reuses the existing file instead of synthesizing again.
 */
app.post('/tts', async (request, response) => {
  const text = sanitizeText(String(request.body?.text ?? ''));

  if (!text) {
    response.status(400).json({ error: 'Missing text' });
    return;
  }

  const voice = String(request.body?.voice ?? getVoiceFromEnv());
  const rate = String(request.body?.rate ?? process.env.TTS_RATE ?? '-8%');
  const pitch = String(request.body?.pitch ?? process.env.TTS_PITCH ?? '+0Hz');
  const hash = crypto.createHash('sha256').update(`${voice}|${rate}|${pitch}|${text}`).digest('hex');
  const filename = `${hash}.mp3`;
  const filePath = path.join(cacheDir, filename);

  if (!fs.existsSync(filePath)) {
    const tts = new MsEdgeTTS({});

    try {
      await tts.setMetadata(voice, OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3);
      const { audioFilePath, metadataFilePath } = await tts.toFile(cacheDir, escapeXml(text), { rate, pitch });

      // Move the generated audio to its content-addressed name.
      fs.renameSync(audioFilePath, filePath);

      // The metadata side file is not served, so remove it when one was written.
      if (metadataFilePath && fs.existsSync(metadataFilePath)) {
        fs.unlinkSync(metadataFilePath);
      }
    } finally {
      // Always close the client, including when synthesis or the file
      // operations above throw; otherwise a failed request would leave it open.
      tts.close();
    }
  }

  response.json({ audioUrl: `${request.protocol}://${request.get('host')}/audio/${filename}` });
});
|
||||||
|
|
||||||
|
app.listen(port, host, () => {
|
||||||
|
console.log(`TTS server listening on http://${host}:${port}`);
|
||||||
|
console.log(`Voice: ${getVoiceFromEnv()}`);
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
 * Returns the server's default voice name.
 *
 * Uses the TTS_VOICE environment variable when it is defined (an empty string
 * counts as defined) and `en-US-JennyNeural` otherwise.
 */
function getVoiceFromEnv() {
  const { TTS_VOICE: configuredVoice } = process.env;

  return configuredVoice ?? 'en-US-JennyNeural';
}
|
||||||
|
|
||||||
|
/**
 * Reduces chat/markdown text to plain prose suitable for speech synthesis.
 *
 * Steps, in order:
 * 1. drop `<think>…</think>` blocks;
 * 2. replace fenced code blocks with a space;
 * 3. unwrap inline code and `**bold**` spans, keeping their content;
 * 4. remove the remaining `* _ # > ~` characters;
 * 5. remove http(s) URLs;
 * 6. collapse whitespace runs to single spaces and trim;
 * 7. truncate to `maxLength` UTF-16 code units.
 *
 * Truncation never ends on half of a surrogate pair and never leaves trailing
 * whitespace, so the result is always well-formed text.
 *
 * @param value Raw text to clean.
 * @param maxLength Maximum length of the result in UTF-16 code units (default 1800).
 * @returns The cleaned text; may be empty.
 */
function sanitizeText(value, maxLength = 1800) {
  const cleaned = value
    .replace(/<think>[\s\S]*?<\/think>/gi, '')
    .replace(/```[\s\S]*?```/g, ' ')
    .replace(/`([^`]+)`/g, '$1')
    .replace(/\*\*([^*]+)\*\*/g, '$1')
    .replace(/[*_#>~]/g, '')
    .replace(/https?:\/\/\S+/g, '')
    .replace(/\s+/g, ' ')
    .trim();

  if (cleaned.length <= maxLength) {
    return cleaned;
  }

  let truncated = cleaned.slice(0, maxLength);

  // A cut between the two halves of a surrogate pair leaves a lone high
  // surrogate at the end; drop it rather than emit an invalid character.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    truncated = truncated.slice(0, -1);
  }

  // The cut may land right after a space that the earlier trim could not see.
  return truncated.trimEnd();
}
|
||||||
|
|
||||||
|
/**
 * Escapes the five XML special characters (`&`, `<`, `>`, `"`, `'`) with their
 * predefined entities so the text can be embedded in XML markup as literal
 * character data.
 *
 * @param value Text to escape.
 * @returns The escaped text.
 */
function escapeXml(value) {
  return value
    // `&` must be handled first; otherwise the ampersands introduced by the
    // replacements below would themselves be escaped again.
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');
}
|
||||||
Reference in New Issue
Block a user