-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathpronounce_assessment_file.py
More file actions
110 lines (91 loc) · 6.32 KB
/
pronounce_assessment_file.py
File metadata and controls
110 lines (91 loc) · 6.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import azure.cognitiveservices.speech as speechsdk
import threading
import wave
import time
import string
import json
import intonation as intonate
# weatherfilename = 'output.wav'
speech_key = "YOUR_AZURE_SPEECH_STUDIO_API_KEY"
service_region = "eastus"
# filename='D:\\GPT\\GPT\\reading_evalaution\\zz.wav'
# # filename = r'F:\chatbot_keyboard\user_audio.wav'
# language = 'en-US'
# reference_text = 'The image shows a bar graph displaying the number of coffee and manufacturing sales in the United States from 2016 to 2018. The graph shows that there was a steady increase in sales from 2016 to 2018, with the highest sales occurring in August 2018. The graph also shows that there was a slight decrease in sales in December 2017 and January 2018.'
def pronunciation_assessment_configured_with_json(filename, language, reference_text):
average_intonation = 0
"""Performs pronunciation assessment asynchronously with input from an audio file.
See more information at https://aka.ms/csspeech/pa"""
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and service region (e.g., "westus").
# Note: The sample is for en-US language.
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
audio_config = speechsdk.audio.AudioConfig(filename=filename)
reference_text = reference_text
# Create pronunciation assessment config with json string (JSON format is not recommended)
enable_miscue, enable_prosody = True, True
config_json = {
"GradingSystem": "HundredMark",
"Granularity": "Phoneme",
"Dimension": "Comprehensive",
"ScenarioId": "", # "" is the default scenario or ask product team for a customized one
"EnableMiscue": enable_miscue,
"EnableProsodyAssessment": enable_prosody,
"NBestPhonemeCount": 0, # > 0 to enable "spoken phoneme" mode, 0 to disable
}
pronunciation_config = speechsdk.PronunciationAssessmentConfig(json_string=json.dumps(config_json))
pronunciation_config.reference_text = reference_text
# Create a speech recognizer using a file as audio input.
language = 'en-US'
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, language=language, audio_config=audio_config)
# Apply pronunciation assessment config to speech recognizer
pronunciation_config.apply_to(speech_recognizer)
result = speech_recognizer.recognize_once_async().get()
# Check the result
if result.reason == speechsdk.ResultReason.RecognizedSpeech:
print('pronunciation assessment for: {}'.format(result.text))
pronunciation_result = json.loads(result.properties.get(speechsdk.PropertyId.SpeechServiceResponse_JsonResult))
# print('assessment results:\n{}'.format(json.dumps(pronunciation_result, indent=4)))
elif result.reason == speechsdk.ResultReason.NoMatch:
print("No speech could be recognized")
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
print("Speech Recognition canceled: {}".format(cancellation_details.reason))
if cancellation_details.reason == speechsdk.CancellationReason.Error:
print("Error details: {}".format(cancellation_details.error_details))
per_word_pronounciation_assessment_result = []
final_pronounciation_assessment_result = []
pronunciation_result_parsed_01 = pronunciation_result['NBest'][0]
pronounciation_assessment_result = pronunciation_result_parsed_01['PronunciationAssessment']
# print(pronounciation_assessment_result)
final_pronounciation_assessment_result.append(pronounciation_assessment_result)
length = len(pronunciation_result_parsed_01['Words'])
for i in range(len(pronunciation_result_parsed_01['Words'])):
# del pronunciation_result_parsed_01['Words'][i]['Syllables']
del pronunciation_result_parsed_01['Words'][i]['Phonemes']
# print(pronunciation_result_parsed_01['Words'][i])
intonation_results = pronunciation_result_parsed_01['Words'][i]['PronunciationAssessment']['Feedback']['Prosody']['Intonation']
# print(intonation_results)
intonation = pronunciation_result_parsed_01['Words'][i]['PronunciationAssessment']['Feedback']['Prosody']['Intonation']['Monotone']['SyllablePitchDeltaConfidence']
average_intonation = (average_intonation + intonation)
# print(intonation)
per_word_pronounciation_assessment_result.append(pronunciation_result_parsed_01['Words'][i])
per_word_pronounciation_assessment_result.append(intonation_results)
# per_word_pronounciation_assessment_result.append(pitch_per_word[i])
del pronunciation_result_parsed_01['Words'][i]['PronunciationAssessment']['Feedback']
final_pronounciation_assessment_result.append(per_word_pronounciation_assessment_result)
# print(per_word_pronounciation_assessment_result)
# print(f'\n\n')
average_intonation = average_intonation / length
accuracy = pronounciation_assessment_result['AccuracyScore']/100
fluency = pronounciation_assessment_result['FluencyScore']/100
prosody_score = pronounciation_assessment_result['ProsodyScore']/100
completeness = pronounciation_assessment_result['CompletenessScore']/100
avg_pro_score = pronounciation_assessment_result['PronScore']/100
# print(f'Accuracy: {accuracy} \nFluency: {fluency} \nProsody Score: {prosody_score} \nCompleteness: {completeness} \nAverage Pronounciation Score: {avg_pro_score} \nIntonation: {average_intonation} \n\n')
# final_pronounciation_assessment_result.append(pitch_per_word)
# final_pronounciation_assessment_result.append(('overall_pitch: ' + str(overall_pitch)))
# print(final_pronounciation_assessment_result)
return accuracy, fluency, prosody_score, completeness, avg_pro_score, average_intonation, final_pronounciation_assessment_result
# accuracy, fluency, prosody_score, completeness, avg_pro_score, average_intonation, pitch_per_word, overall_pitch, final_pronounciation_assessment_result = pronunciation_assessment_configured_with_json(filename, language, reference_text)
# pronunciation_assessment_configured_with_json(filename, language, reference_text)