<!-- HTML5 video player: MP4 source with WebM fallback, plus three WebVTT
     subtitle tracks (English is the default). preload="metadata" asks the
     browser to fetch only duration/dimensions up front, not the media data. -->
<video id="video" controls preload="metadata">
<source src="video/sintel-short.mp4" type="video/mp4">
<source src="video/sintel-short.webm" type="video/webm">
<track label="English" kind="subtitles" srclang="en" src="captions/vtt/sintel-en.vtt" default>
<track label="Deutsch" kind="subtitles" srclang="de" src="captions/vtt/sintel-de.vtt">
<track label="Español" kind="subtitles" srclang="es" src="captions/vtt/sintel-es.vtt">
</video>
// Per the sample linked above: build a subtitles menu and append it to the
// player container — one "Off" entry plus one entry per available text track.
var subtitlesMenu;
if (video.textTracks) {
  var df = document.createDocumentFragment();
  // NOTE: the original redeclared `subtitlesMenu` with a second `var` here,
  // shadowing nothing but cluttering the scope — assign to the outer one.
  subtitlesMenu = df.appendChild(document.createElement('ul'));
  subtitlesMenu.className = 'subtitles-menu';
  subtitlesMenu.appendChild(createMenuItem('subtitles-off', '', 'Off'));
  for (var i = 0; i < video.textTracks.length; i++) {
    var track = video.textTracks[i]; // hoist the repeated indexed lookup
    subtitlesMenu.appendChild(createMenuItem('subtitles-' + track.language, track.language, track.label));
  }
  videoContainer.appendChild(subtitlesMenu);
}
If I am understanding you correctly, IMHO I would separate it into two parts
Transcribe the speech to text, like below, using the Google API
and then do the caption as stream overlay
/**
 * Performs streaming speech recognition on raw PCM audio data and prints the
 * first (most likely) transcript of each streamed response to stdout.
 *
 * @param fileName the path to a raw LINEAR16 PCM audio file (16 kHz, en-US) to transcribe
 * @throws Exception if the file cannot be read, the Speech client cannot be
 *     created, or the streaming call fails
 */
public static void streamingRecognizeFile(String fileName) throws Exception {
  Path path = Paths.get(fileName);
  byte[] data = Files.readAllBytes(path);
  // Instantiates a client using GOOGLE_APPLICATION_CREDENTIALS from the environment.
  try (SpeechClient speech = SpeechClient.create()) {
    // Configure the request to match the local raw PCM audio.
    RecognitionConfig recConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(16000)
            .setModel("default")
            .build();
    StreamingRecognitionConfig config =
        StreamingRecognitionConfig.newBuilder().setConfig(recConfig).build();
    // Local observer that buffers every streamed message and exposes the
    // complete list (or the failure) through a SettableFuture.
    class ResponseApiStreamingObserver<T> implements ApiStreamObserver<T> {
      private final SettableFuture<List<T>> future = SettableFuture.create();
      private final List<T> messages = new java.util.ArrayList<T>();
      @Override
      public void onNext(T message) {
        messages.add(message);
      }
      @Override
      public void onError(Throwable t) {
        future.setException(t);
      }
      @Override
      public void onCompleted() {
        future.set(messages);
      }
      // Returns the SettableFuture object to get received messages / exceptions.
      public SettableFuture<List<T>> future() {
        return future;
      }
    }
    ResponseApiStreamingObserver<StreamingRecognizeResponse> responseObserver =
        new ResponseApiStreamingObserver<>();
    BidiStreamingCallable<StreamingRecognizeRequest, StreamingRecognizeResponse> callable =
        speech.streamingRecognizeCallable();
    ApiStreamObserver<StreamingRecognizeRequest> requestObserver =
        callable.bidiStreamingCall(responseObserver);
    // The first request must **only** contain the audio configuration:
    requestObserver.onNext(
        StreamingRecognizeRequest.newBuilder().setStreamingConfig(config).build());
    // Subsequent requests must **only** contain the audio data.
    requestObserver.onNext(
        StreamingRecognizeRequest.newBuilder()
            .setAudioContent(ByteString.copyFrom(data))
            .build());
    // Mark transmission as completed after sending the data.
    requestObserver.onCompleted();
    // Blocks until the stream completes (or rethrows the stream's failure).
    List<StreamingRecognizeResponse> responses = responseObserver.future().get();
    for (StreamingRecognizeResponse response : responses) {
      // For streaming recognize, the results list has one is_final result (if available) followed
      // by a number of in-progress results (if interim_results is true) for subsequent utterances.
      // Just print the first result here; skip responses that carry no result
      // (the original code would throw IndexOutOfBoundsException on those).
      if (response.getResultsList().isEmpty()) {
        continue;
      }
      StreamingRecognitionResult result = response.getResultsList().get(0);
      // There can be several alternative transcripts for a given chunk of speech. Just use the
      // first (most likely) one here.
      if (result.getAlternativesList().isEmpty()) {
        continue;
      }
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript : %s\n", alternative.getTranscript());
    }
  }
}
In the course of text conversion, you will need to break it down to pieces and because of this, the meaning may change slightly, since some expressions may have a different meaning than a single word, but this will help reduce the time of the final translation. Then send the already received segments (words, phrases) via API for translation.
You can choose several options you like (for example https://rapidapi.com/blog/best-translation-api/) and check which one will work faster. In my experience "Microsoft Translator Text" and "Google Translate" are some of the fastest. I also think that you will not be able to get instant translation, but if you test several API options and play around with whether to process all sentences, phrases or individual words at once, you can reduce the translation time to a minimum.
发布评论
评论(2)
如果我理解正确的话,恕我直言,我会将其分成两部分
将语音转录为文本,如下来自 google api
然后将标题作为流叠加
<pre><code>//
// 对原始 PCM 音频数据执行流式语音识别。
//
// @param fileName 要转录的 PCM 音频文件的路径。
//
public static void streamingRecognizeFile(String fileName) throws Exception, IOException {
  Path path = Paths.get(fileName);
  byte[] data = Files.readAllBytes(path);
  // 使用 GOOGLE_APPLICATION_CREDENTIALS 实例化客户端
  try (SpeechClient speech = SpeechClient.create()) {
    // 使用本地原始 PCM 音频配置请求
    RecognitionConfig recConfig =
        RecognitionConfig.newBuilder()
            .setEncoding(AudioEncoding.LINEAR16)
            .setLanguageCode("en-US")
            .setSampleRateHertz(16000)
            .setModel("default")
            .build();
    StreamingRecognitionConfig config =
        StreamingRecognitionConfig.newBuilder().setConfig(recConfig).build();
    class ResponseApiStreamingObserver&lt;T&gt; implements ApiStreamObserver&lt;T&gt; {
      private final SettableFuture&lt;List&lt;T&gt;&gt; future = SettableFuture.create();
      private final List&lt;T&gt; messages = new java.util.ArrayList&lt;T&gt;();
      @Override
      public void onNext(T message) {
        messages.add(message);
      }
      @Override
      public void onError(Throwable t) {
        future.setException(t);
      }
      @Override
      public void onCompleted() {
        future.set(messages);
      }
      // 返回 SettableFuture 对象以获取接收到的消息/异常。
      public SettableFuture&lt;List&lt;T&gt;&gt; future() {
        return future;
      }
    }
    ResponseApiStreamingObserver&lt;StreamingRecognizeResponse&gt; responseObserver =
        new ResponseApiStreamingObserver&lt;&gt;();
    BidiStreamingCallable&lt;StreamingRecognizeRequest, StreamingRecognizeResponse&gt; callable =
        speech.streamingRecognizeCallable();
    ApiStreamObserver&lt;StreamingRecognizeRequest&gt; requestObserver =
        callable.bidiStreamingCall(responseObserver);
    // 第一个请求必须**仅**包含音频配置:
    requestObserver.onNext(
        StreamingRecognizeRequest.newBuilder().setStreamingConfig(config).build());
    // 后续请求必须**仅**包含音频数据。
    requestObserver.onNext(
        StreamingRecognizeRequest.newBuilder()
            .setAudioContent(ByteString.copyFrom(data))
            .build());
    // 发送数据后将传输标记为完成。
    requestObserver.onCompleted();
    List&lt;StreamingRecognizeResponse&gt; responses = responseObserver.future().get();
    for (StreamingRecognizeResponse response : responses) {
      // 对于流式识别,结果列表有一个 is_final 结果(如果可用),
      // 后跟后续话语的若干正在进行的结果(如果 interim_results 为 true)。
      // 这里只打印第一个结果。
      StreamingRecognitionResult result = response.getResultsList().get(0);
      // 对于给定的语音块可以有多个替代转录本。这里只使用
      // 第一个(最有可能的)。
      SpeechRecognitionAlternative alternative = result.getAlternativesList().get(0);
      System.out.printf("Transcript : %s\n", alternative.getTranscript());
    }
  }
}
</code></pre>
对于您的移动语音 叠加
https://github.com/algolia/voice-overlay-android
对于网页 HTML 5 叠加
If I am understanding you correctly, IMHO I would separate it into two parts
Transcribe the speech to text, like below, using the Google API
and then do the caption as stream overlay
For your mobile Voice overlay
https://github.com/algolia/voice-overlay-android
For web HTML 5 overlay
将语音转换为文本的最快、最有效的方法之一是 Java Speech API(文档位于 https://www.oracle.com/java/technologies/speech-api-frequently-asked-questions.html)
在文本转换的过程中,您需要将其分解成碎片,因此,含义可能会略有变化,因为某些表达可能与单个单词具有不同的含义,但这将有助于减少最终翻译的时间。然后通过 API 发送已收到的片段(单词、短语)进行翻译。
您可以选择您喜欢的多个选项(例如 https://rapidapi.com/blog/best -translation-api/)并检查哪一个工作得更快。根据我的经验,“微软文本翻译”和“谷歌翻译”是最快的。我还认为你将无法获得即时翻译,但如果你测试几个 API 选项并尝试是否一次处理所有句子、短语或单个单词,你可以将翻译时间减少到最低限度。
One of the fastest and most efficient ways to convert speech to text is Java Speech API (documentation at https://www.oracle.com/java/technologies/speech-api-frequently-asked-questions.html)
In the course of text conversion, you will need to break it down to pieces and because of this, the meaning may change slightly, since some expressions may have a different meaning than a single word, but this will help reduce the time of the final translation. Then send the already received segments (words, phrases) via API for translation.
You can choose several options you like (for example https://rapidapi.com/blog/best-translation-api/) and check which one will work faster. In my experience "Microsoft Translator Text" and "Google Translate" are some of the fastest. I also think that you will not be able to get instant translation, but if you test several API options and play around with whether to process all sentences, phrases or individual words at once, you can reduce the translation time to a minimum.