iOS (Swift) speech transcription - capturing the last word/number during continuous transcription
I am struggling a bit with a Speech app I am working on. I followed the dev example for creating a Speech Recognizer on Apple's developer website here, and my code is below. It is working well, and I am getting continuous recognition as expected.
However, my app idea requires me to capture each number in a series of numbers as it is spoken. With the code below, I can successfully speak a long series of numbers (e.g. "2, 5, 3, 7, 10, 6...") and, once I stop, it will eventually return an SFTranscription array with transcriptions holding segments for each number I spoke. I say eventually because the speech recognizer is constantly trying to determine an intelligible response in human language or common formats (in this case phone numbers, larger multi-digit numbers, etc.), which is what it should do for dictation and natural language. But I would like to get each word (number) as it is spoken, before the recognizer tries to make sense of it. Is there a way to grab the last word spoken before the recognizer attempts to relate it to all of the words that came before it? (There is a rough sketch of the only workaround I can think of after the example output below.)
import UIKit
import Speech

public class ViewController: UIViewController, SFSpeechRecognizerDelegate {

    private let speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!
    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
    private var recognitionTask: SFSpeechRecognitionTask?
    private let audioEngine = AVAudioEngine()

    @IBOutlet var textView: UITextView!
    @IBOutlet var recordButton: UIButton!

    private var isListening = false

    public override func viewDidLoad() {
        super.viewDidLoad()
        recordButton.isEnabled = false
        textView.isEditable = false
    }

    override public func viewDidAppear(_ animated: Bool) {
        super.viewDidAppear(animated)
        speechRecognizer.delegate = self

        // Ask for speech recognition permission and enable the record button accordingly.
        SFSpeechRecognizer.requestAuthorization { authStatus in
            OperationQueue.main.addOperation {
                switch authStatus {
                case .authorized:
                    self.recordButton.isEnabled = true
                case .denied:
                    self.recordButton.isEnabled = false
                    self.recordButton.setTitle("User denied access to speech recognition", for: .disabled)
                case .restricted:
                    self.recordButton.isEnabled = false
                    self.recordButton.setTitle("Speech recognition restricted on this device", for: .disabled)
                case .notDetermined:
                    self.recordButton.isEnabled = false
                    self.recordButton.setTitle("Speech recognition not yet authorized", for: .disabled)
                default:
                    self.recordButton.isEnabled = false
                }
            }
        }
    }

    private func startRecording() throws {
        // Cancel any in-flight recognition task before starting a new one.
        recognitionTask?.cancel()
        self.recognitionTask = nil

        // Configure the audio session for recording.
        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
        try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
        let inputNode = audioEngine.inputNode

        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        guard let recognitionRequest = recognitionRequest else {
            fatalError("Unable to create a SFSpeechAudioBufferRecognitionRequest object")
        }
        recognitionRequest.shouldReportPartialResults = true
        if #available(iOS 13, *) {
            recognitionRequest.requiresOnDeviceRecognition = false
        }

        recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
            var isFinal = false
            if let result = result {
                if self.isListening {
                    // Grab each SFTranscription from the result and print its segments.
                    result.transcriptions.forEach { transcription in
                        transcription.segments.forEach { segment in
                            print(segment.substring)
                        }
                    }
                    print("---")
                }
                isFinal = result.isFinal
            }
            if error != nil || isFinal {
                // Stop the engine and tear down the request/task when recognition ends.
                self.audioEngine.stop()
                inputNode.removeTap(onBus: 0)
                self.recognitionRequest = nil
                self.recognitionTask = nil
                self.recordButton.isEnabled = true
                self.recordButton.setTitle("Start Recording", for: [])
            }
        }

        // Feed microphone buffers into the recognition request.
        let recordingFormat = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            self.recognitionRequest?.append(buffer)
        }

        audioEngine.prepare()
        try audioEngine.start()
        textView.text = "(Go ahead, I'm listening)"
    }

    public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
        if available {
            recordButton.isEnabled = true
            recordButton.setTitle("Start Recording", for: [])
        } else {
            recordButton.isEnabled = false
            recordButton.setTitle("Recognition Not Available", for: .disabled)
        }
    }

    @IBAction func recordButtonTapped() {
        if audioEngine.isRunning {
            audioEngine.stop()
            recognitionRequest?.endAudio()
            recordButton.isEnabled = false
            recordButton.setTitle("Stopping", for: .disabled)
            self.isListening = false
        } else {
            do {
                try startRecording()
                recordButton.setTitle("Stop Recording", for: [])
                self.isListening = true
            } catch {
                recordButton.setTitle("Recording Not Available", for: [])
            }
        }
    }
}
Example output for saying "4, 7, 5, 5, 4, 3" - each block delimited by "---" represents all of the segments in one returned transcription.
For
---
For
seven
---
47
---
475
---
4
7
5
---
4755
---
47554
---
475543
---
475543
---
4
7
5
5
4
3
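To make the question concrete, the only workaround I have come up with is to track how many segments I have already handled and only act on the segments that are new in each partial result. A minimal sketch of that idea (the helper name newSubstrings(in:alreadyEmitted:) and the stored counter are mine, not part of Apple's sample):

import Speech

// Hypothetical helper: given how many segments of a transcription have already
// been handled, return only the substrings that appeared since the previous
// partial result.
func newSubstrings(in transcription: SFTranscription, alreadyEmitted: Int) -> [String] {
    let segments = transcription.segments
    guard segments.count > alreadyEmitted else { return [] }
    return segments[alreadyEmitted...].map { $0.substring }
}

In the result handler I would call this with result.bestTranscription and a stored counter, then bump the counter to segments.count. That works as long as each number stays in its own segment, but as the output above shows, later partial results merge earlier segments (e.g. "4" and "7" become a single "47" segment), so the count comparison falls apart and previously emitted words get rewritten.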
I can handle the spelled-out responses (e.g. "For" for "4") pretty easily with a function, but the long concatenated number strings are what's fouling me up. I want to grab the individual numbers before they get concatenated, and not have to wait until the very end when the recognizer eventually separates them into individual segments again.
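For completeness, the normalization function I mentioned is roughly this (just a sketch: the homophone table is illustrative, digitValue(from:) is my own name, and NumberFormatter's .spellOut style handles the properly spelled words):

import Foundation

// Rough sketch of the word-to-number normalization mentioned above.
func digitValue(from word: String) -> Int? {
    let lowered = word.lowercased()

    // Common single-digit misrecognitions/homophones ("For" -> 4, etc.).
    let homophones: [String: Int] = [
        "for": 4, "fore": 4, "to": 2, "too": 2,
        "won": 1, "ate": 8, "oh": 0
    ]
    if let value = homophones[lowered] {
        return value
    }

    // Already numeric ("4", "47", "475543", ...).
    if let value = Int(lowered) {
        return value
    }

    // Properly spelled-out numbers ("four", "seven", ...).
    let formatter = NumberFormatter()
    formatter.numberStyle = .spellOut
    formatter.locale = Locale(identifier: "en_US")
    return formatter.number(from: lowered)?.intValue
}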
Thanks for any help!