Soprano-RVC

Runtime error

App Files Files Community

Soprano-RVC / app.py

ekwek

Update app.py

278ac29 verified 6 months ago

Raw

History Blame Contribute Delete

3.88 kB

	import gradio as gr
	import torch
	import numpy as np
	from soprano import SopranoTTS
	from scipy.io.wavfile import write as wav_write
	import tempfile
	import os
	import spaces

	# Fail fast with an explicit exception instead of `assert`: assertions are
	# stripped when Python runs with -O, which would let the app continue with
	# DEVICE == "cpu" even though the demo requires a GPU.
	if not torch.cuda.is_available():
	    raise RuntimeError("Demo requires a GPU.")

	# CUDA availability was verified above, so the device is always "cuda" here.
	DEVICE = "cuda"
	print(DEVICE)

	# Shared SopranoTTS instance; remains None until load_model() is first called.
	model = None


	def load_model():
	    """Return the shared SopranoTTS instance, constructing it on first use.

	    The instance is stored in the module-level ``model`` so that later calls
	    return the same object instead of building a new one.
	    """
	    global model

	    # Already built by an earlier call: hand back the existing instance.
	    if model is not None:
	        return model

	    settings = {
	        "backend": "auto",
	        "device": DEVICE,
	        "cache_size_mb": 100,
	        "decoder_batch_size": 1,
	    }
	    model = SopranoTTS(**settings)
	    return model


	# Sample rate paired with the generated samples, both in the tuple yielded to
	# the Gradio audio output and in the WAV file written by save_audio().
	SAMPLE_RATE = 32000

	@spaces.GPU
	def tts_stream(text, temperature, top_p, repetition_penalty, state):
	    """Synthesize ``text`` and yield the result for the Gradio outputs.

	    Args:
	        text: Text to synthesize. ``None``, an empty string, or a
	            whitespace-only string produces no audio.
	        temperature: Forwarded to ``model.infer`` as ``temperature``.
	        top_p: Forwarded to ``model.infer`` as ``top_p``.
	        repetition_penalty: Forwarded to ``model.infer`` as
	            ``repetition_penalty``.
	        state: Previously stored audio; passed through unchanged when there
	            is nothing to synthesize.

	    Yields:
	        A single ``(audio, new_state)`` pair. With usable text, ``audio`` is
	        ``(SAMPLE_RATE, samples)`` and ``new_state`` is the same ``samples``
	        NumPy array. Otherwise ``audio`` is ``None`` and ``new_state`` is the
	        incoming ``state``.
	    """
	    model = load_model()

	    # `not text` covers None (e.g. a null sent by an API client) so that
	    # .strip() is only ever called on a real string.
	    if not text or not text.strip():
	        yield None, state
	        return

	    out = model.infer(
	        text,
	        temperature=temperature,
	        top_p=top_p,
	        repetition_penalty=repetition_penalty,
	    )

	    # Move the result to host memory before converting it to a NumPy array.
	    audio_np = out.cpu().numpy()
	    yield (SAMPLE_RATE, audio_np), audio_np


	def save_audio(state):
	    """Write the stored audio to a temporary ``.wav`` file and return its path.

	    Args:
	        state: Audio samples to write, or ``None``.

	    Returns:
	        The path of the newly created WAV file, or ``None`` when ``state`` is
	        ``None`` or has length zero.

	    Raises:
	        Whatever ``wav_write`` raises; the temporary file is removed first so
	        a failed write does not leave an orphaned file behind.
	    """
	    if state is None or len(state) == 0:
	        return None

	    # mkstemp returns an open descriptor; close it right away because
	    # wav_write reopens the file by path.
	    fd, path = tempfile.mkstemp(suffix=".wav")
	    os.close(fd)

	    try:
	        wav_write(path, SAMPLE_RATE, state)
	    except BaseException:
	        # Best-effort cleanup: the original error is what matters, so a
	        # failure to delete the file must not mask it.
	        try:
	            os.remove(path)
	        except OSError:
	            pass
	        raise
	    return path


	# Two-column page: text input and sampling controls in the first column,
	# audio playback and usage tips in the second.
	with gr.Blocks() as demo:
	    # Holds the samples from the latest generation (second output of tts_stream).
	    state_audio = gr.State(None)

	    with gr.Row():
	        with gr.Column():
	            gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve <15 ms streaming latency and up to 2000x real-time generation, all while being easy to deploy at <1 GB VRAM usage.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")

	            text_in = gr.Textbox(
	                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
	                label="Input Text",
	                placeholder="Enter text to synthesize...",
	                lines=4,
	            )

	            # Sampling parameters forwarded to tts_stream; collapsed by default.
	            with gr.Accordion("Advanced options", open=False):
	                temperature = gr.Slider(
	                    minimum=0.0, maximum=1.0, value=0.3, step=0.05, label="Temperature"
	                )
	                top_p = gr.Slider(
	                    minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="Top-p"
	                )
	                repetition_penalty = gr.Slider(
	                    minimum=1.0, maximum=2.0, value=1.2, step=0.05, label="Repetition penalty"
	                )

	            gen_btn = gr.Button("Generate")

	        with gr.Column():
	            audio_out = gr.Audio(label="Output Audio", autoplay=True, streaming=False)

	            # Download controls are currently disabled.
	            #download_btn = gr.Button("Download")
	            #file_out = gr.File(label="Download file")

	            usage_tips = (
	                "Usage tips:\n\n"
	                "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
	                "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
	                "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results.\n"
	                "- Avoid improper grammar such as not using contractions, multiple spaces, etc."
	            )
	            gr.Markdown(usage_tips)

	    # Generate: run synthesis, then update both the player and the stored audio.
	    gen_btn.click(
	        tts_stream,
	        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
	        outputs=[audio_out, state_audio],
	    )

	    # Wiring for the disabled download button (handler: save_audio).
	    #download_btn.click(
	    # fn=save_audio,
	    # inputs=[state_audio],
	    # outputs=[file_out],
	    #)

	# Enable the request queue, then start serving the app.
	demo.queue()
	demo.launch()