Нашли с одним товарищем решение, возможно, кому-то будет полезно.
Правильный скрипт:
#!/usr/bin/env python3
import sys
import time
sys.path.append("..")
from flask import Flask, render_template, request as flask_request, send_from_directory
import ssl
from tinkoff.cloud.tts.v1 import tts_pb2_grpc, tts_pb2
from auth import authorization_metadata
import grpc
import os
import wave
# Sampling rate, in hertz, shared by the synthesis request and the WAV header.
sample_rate = 48_000

# VoiceKit connection settings are read from the environment. An unset or
# empty VOICEKIT_ENDPOINT falls back to api.tinkoff.ai:443; the two keys are
# None when their variables are not set.
endpoint = os.getenv("VOICEKIT_ENDPOINT") or "api.tinkoff.ai:443"
api_key = os.getenv("VOICEKIT_API_KEY")
secret_key = os.getenv("VOICEKIT_SECRET_KEY")

# The text to synthesise and the voice name (for example 'alyona') are not
# fixed here: they are taken from the submitted form on each request.
# TLS for the gRPC channel is configured through grpc.ssl_channel_credentials(),
# so the standard ssl module is deliberately left untouched here.

# NOTE(review): static_folder='/' points Flask's static file handling at the
# filesystem root. Confirm this is intended: it can expose any file the
# process is able to read. A dedicated static directory would be safer.
app = Flask(__name__, static_folder='/')

# Directory under the application's root path that holds generated WAV files.
storage = os.path.join(app.root_path, "storage")
# exist_ok=True creates the directory when missing without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(storage, exist_ok=True)
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
@app.route('/build_request', methods=['POST'])
def route_build_request_post():
    """Synthesise the posted text into a WAV file and return it as a download.

    Reads the ``data_text`` and ``speaker_name`` form fields, streams the
    synthesised audio from the TTS service into a WAV file inside ``storage``
    and sends that file back as an attachment.

    Returns:
        The attachment response for the generated file, or a plain-text
        400 response when ``data_text`` is missing or empty.
    """
    data_text = flask_request.form.get('data_text')
    speaker_name = flask_request.form.get('speaker_name')
    if not data_text:
        # Without text there is nothing to synthesise; fail fast instead of
        # sending an empty request to the service.
        return "Form field 'data_text' is required.", 400

    # NOTE(review): the name has one-second resolution, so two requests in
    # the same second write to the same file.
    filename = f'{time.strftime("%H-%M-%S", time.localtime())}.wav'
    # The file must live in ``storage`` because send_from_directory() below
    # looks it up there, not in the current working directory.
    wav_path = os.path.join(storage, filename)

    with wave.open(wav_path, "wb") as f:
        # WAV header: one channel, 2-byte samples, at the sample rate that
        # is also requested from the service.
        f.setframerate(sample_rate)
        f.setnchannels(1)
        f.setsampwidth(2)

        # The context manager closes the channel once streaming is finished.
        with grpc.secure_channel(endpoint, grpc.ssl_channel_credentials()) as channel:
            stub = tts_pb2_grpc.TextToSpeechStub(channel)
            request = build_request(data_text, speaker_name)
            metadata = authorization_metadata(api_key, secret_key, "tinkoff.cloud.tts")
            responses = stub.StreamingSynthesize(request, metadata=metadata)

            for key, value in responses.initial_metadata():
                if key == "x-audio-duration-seconds":
                    print("Estimated audio duration is {:.2f} seconds".format(float(value)))
                    break

            # Append every streamed audio chunk to the WAV file as it arrives.
            for stream_response in responses:
                f.writeframes(stream_response.audio_chunk)

    return send_from_directory(storage, filename, as_attachment=True)
def build_request(text, name):
    """Build the speech synthesis request for ``text`` spoken by voice ``name``.

    Args:
        text: Text placed into the request's synthesis input.
        name: Voice name placed into the request's voice selection.

    Returns:
        A ``tts_pb2.SynthesizeSpeechRequest`` asking for LINEAR16 audio at
        the module-level ``sample_rate``.
    """
    synthesis_input = tts_pb2.SynthesisInput(text=text)
    audio_config = tts_pb2.AudioConfig(
        audio_encoding=tts_pb2.LINEAR16,
        sample_rate_hertz=sample_rate,
    )
    voice = tts_pb2.VoiceSelectionParams(name=name)
    return tts_pb2.SynthesizeSpeechRequest(
        input=synthesis_input,
        audio_config=audio_config,
        voice=voice,
    )
if __name__ == '__main__':
    # The server listens on every interface, so the Werkzeug interactive
    # debugger (which allows arbitrary code execution) must not be enabled
    # unconditionally. Opt in explicitly, e.g. FLASK_DEBUG=1, for local work.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host='0.0.0.0', port=3000, debug=debug_enabled)