Unity AIUI

上篇回顾https://blog.csdn.net/weixin_42208093/article/details/106277629在上篇文章中，我们讲到了讯飞的语音识别和合成以及百度的UNIT人机交互。但由于百度的语义技能过少达不到我们想要的要求。类如音乐、故事部分并没有实际的内容，所以我又重新好好研究了一下讯飞AIUI。但由于Window版本的AIUIDLL我们无法使用，就选择了使用WebA

VAIN_K

1098人浏览 · 2020-05-27 18:56:35

VAIN_K · 2020-05-27 18:56:35 发布

上篇回顾

https://blog.csdn.net/weixin_42208093/article/details/106277629
在上篇文章中，我们讲到了讯飞的语音识别和合成以及百度的UNIT 人机交互。
但由于百度的语义技能过少达不到我们想要的要求。类如音乐、故事部分并没有实际的内容，所以我又重新好好研究了一下讯飞AIUI。但由于Window版本的AIUIDLL我们无法使用，就选择了使用WebAPI版AIUI，效果也是一样的。

WebAPI AIUI

WebAPI创建就不说了，在创建好WebAPI AIUI项目后，项目中的开发工具中有C#的示例代码
在这里插入图片描述
我们只需要稍微修改下就行，这里就直接贴出修改好的代码。

注意：
1.APPID、API_KEY 是WebAPI的，和之前的SDK版本不一样。
2.另外AUTH_ID 是自己定义的，如果不知道怎么定义可以直接复制【开发工具】中的 “authId”。
3.示例代码中private const string SCENE = “main”; 修改为 “main_box”，不然会出现询问问题，没有回答的情况【当前页面配置修改仅在测试环境生效，设备端体验需要SDK传参时在情景模式后加“_box”或“更新发布”至生产环境体验。】
4.询问的问题，需要是已添加的商店技能里的问题才能做出正确的回答

using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Security.Cryptography;
using System.Text;
using UnityEngine;

public class AIUI : MonoBehaviour
{
	private const string URL = "http://openapi.xfyun.cn/v2/aiui";
	private const string APPID = "";
	private const string API_KEY = "";
	private const string AUE = "raw";
	private const string AUTH_ID = "2049a1b2fdedae553bd03ce6f4820ac5";
	private const string DATA_TYPE = "audio";
	private const string SAMPLE_RATE = "16000";
	private const string SCENE = "main_box";//需要加_box
	private const string RESULT_LEVEL = "plain";
	private string FILE_PATH = "";

	void Start()
	{
		//测试
		FILE_PATH = Application.streamingAssetsPath + "/Baidu-TTS.pcm";
		Dictionary<string, string> header = buildHeader();
		byte[] dataByteArray = readFile(FILE_PATH);
		print(aiui(dataByteArray));
	}

	public static string aiui(byte[] boby)
	{
		Dictionary<string, string> header = buildHeader();
		string result = HttpPost(URL, header, boby);
		return result;
	}

	/// <summary>
	/// http post访问
	/// </summary>
	/// <param name="url">链接</param>
	/// <param name="headerDic">头字典</param>
	/// <param name="body">主体</param>
	/// <returns></returns>
	private static string HttpPost(string url, Dictionary<string, string> headerDic, byte[] body)
	{
		HttpWebRequest httpWebRequest = null;
		HttpWebResponse httpWebResponse = null;
		string result = "";
		try
		{
			httpWebRequest = (HttpWebRequest)WebRequest.Create(url);
			httpWebRequest.Method = "post";

			httpWebRequest.Headers.Add("X-Param", headerDic["X-Param"]);
			httpWebRequest.Headers.Add("X-CurTime", headerDic["X-CurTime"]);
			httpWebRequest.Headers.Add("X-CheckSum", headerDic["X-CheckSum"]);
			httpWebRequest.Headers.Add("X-Appid", headerDic["X-Appid"]);

			httpWebRequest.ContentLength = body.Length;
			httpWebRequest.GetRequestStream().Write(body, 0, body.Length);
			httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
			Stream responseStream = httpWebResponse.GetResponseStream();
			result = new StreamReader(responseStream, Encoding.UTF8).ReadToEnd();
			responseStream.Close();
			httpWebRequest.Abort();
			httpWebResponse.Close();
		}
		catch (Exception ex)
		{
			Debug.LogError("Error: " + ex.ToString());
		}
		return result;
	}

	/// <summary>
	/// 头字典
	/// </summary>
	/// <returns></returns>
	private static Dictionary<string, string> buildHeader()
	{
		TimeSpan ts = DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, 0);
		string curTime = Convert.ToInt64(ts.TotalSeconds).ToString();
		//请求参数
		string param = "{\"aue\":\"" + AUE + "\",\"result_level\":\"" + RESULT_LEVEL + "\",\"sample_rate\":\"" + SAMPLE_RATE + "\",\"auth_id\":\"" + AUTH_ID + "\",\"data_type\":\"" + DATA_TYPE + "\",\"scene\":\"" + SCENE + "\"}";
		string paramBase64 = Convert.ToBase64String(Encoding.Default.GetBytes(param));
		string checkSum = EncryptWithMD5(API_KEY + curTime + paramBase64);

		Dictionary<string, string> header = new Dictionary<string, string>();
		header.Add("X-Param", paramBase64);
		header.Add("X-CurTime", curTime);
		header.Add("X-CheckSum", checkSum);
		header.Add("X-Appid", APPID);
		return header;
	}

	/// <summary>
	/// MD5加密
	/// </summary>
	/// <param name="source">参数</param>
	/// <returns></returns>
	private static string EncryptWithMD5(string source)
	{
		byte[] sor = Encoding.UTF8.GetBytes(source);
		MD5 md5 = MD5.Create();
		byte[] result = md5.ComputeHash(sor);
		StringBuilder strbul = new StringBuilder(40);
		for (int i = 0; i < result.Length; i++)
		{
			//加密结果"x2"结果为32位,"x3"结果为48位,"x4"结果为64位
			strbul.Append(result[i].ToString("x2"));
		}
		return strbul.ToString();
	}

	private byte[] readFile(string filePath)
	{
		FileStream fs = new FileStream(filePath, FileMode.Open);
		byte[] data = new byte[fs.Length];
		fs.Read(data, 0, data.Length);
		fs.Close();
		return data;
	}
}

测试

下面我们先简单的测试一下，把AIUI能够跑通。我们需要一段询问问题（类如问天气）的音频文件，可以自己录一段，也可以用语音合成生成一段。我是用的百度的语音合成的，然后直接下载。
https://ai.baidu.com/tech/speech/tts_online
在这里插入图片描述
下载下来是Mp3格式的音频，我们还需要做个格式转换，转换成AIUI能识别的.pcm或者.wav
这里我用的是【Cool Edit Pro 2.1】百度自己下一个就行。
另存为：windows pcm 格式就行

添加到unity中修改脚本音频路径。

运行

在这里插入图片描述
通常回答的答案都是在 answer.text中，个别有提供是url，就不是在这了。
json 解析这部分就不说了。

优化

讯飞语音合成这一块做的并不是特别好，Windows 在线版的，其他的不了解。处理合成的时间会随着合成内容的大小而发生变化，100多字的内容，差不多需要5-6秒的处理时间，这个还是很影响用户体验的。百度了一下微软有自己的C#语音合成库：Interop.SpeechLib。处理合成比讯飞快的不是一点点，用法也是特别的简单。需要用线程的来做，不然unity会有假死的状态。

MP3转WAV

AIUI中的【故事】【国学】【音乐】【相声】等技能都是提供的url链接，格式为MP3的格式，unity的WWW或者UnityWebRequest无法加载MP3格式的，就需要我们将MP3转成WAV或者其他可用的格式。
这里使用了NAudio.dll 引用 using NAudio.Wave;

    /// <summary>
    /// UnityWebRequest 加载MP3播放
    /// </summary>
    /// <param name="url"></param>
    /// <param name="audio"></param>
    /// <returns></returns>
    public IEnumerator OnMP3LoadAndPlay(string url, AudioSource audio)
    {
        UnityWebRequest www = UnityWebRequest.Get(url);
        yield return www.SendWebRequest();
        audio.clip = FromMp3Data(www.downloadHandler.data);
        audio.Play();
    }

    /// <summary>
    /// mp3转wav
    /// </summary>
    /// <param name="data">byte[]</param>
    /// <returns></returns>
    AudioClip FromMp3Data(byte[] data)
    {
        //将数据加载到流中
        MemoryStream mp3stream = new MemoryStream(data);
        //将流中的数据转换为WAV格式
        Mp3FileReader mp3audio = new Mp3FileReader(mp3stream);
        WaveStream waveStream = WaveFormatConversionStream.CreatePcmStream(mp3audio);
        //转换为WAV数据
        WAV wav = new WAV(AudioMemStream(waveStream).ToArray());
        AudioClip audioClip = AudioClip.Create("testSound", wav.SampleCount, 1, wav.Frequency, false);
        audioClip.SetData(wav.LeftChannel, 0);
        return audioClip;
    }

    /// <summary>
    /// 内存流
    /// </summary>
    /// <param name="waveStream">byte[]</param>
    /// <returns></returns>
    MemoryStream AudioMemStream(WaveStream waveStream)
    {
        MemoryStream outputStream = new MemoryStream();
        using (WaveFileWriter waveFileWriter = new WaveFileWriter(outputStream, waveStream.WaveFormat))
        {
            byte[] bytes = new byte[waveStream.Length];
            waveStream.Position = 0;
            waveStream.Read(bytes, 0, Convert.ToInt32(waveStream.Length));
            waveFileWriter.Write(bytes, 0, bytes.Length);
            waveFileWriter.Flush();
        }
        return outputStream;
    }

WAV.cs

/* From http://answers.unity3d.com/questions/737002/wav-byte-to-audioclip.html */
public class WAV
{

    // convert two bytes to one float in the range -1 to 1
    static float bytesToFloat(byte firstByte, byte secondByte)
    {
        // convert two bytes to one short (little endian)
        short s = (short)((secondByte << 8) | firstByte);
        // convert to range from -1 to (just below) 1
        return s / 32768.0F;
    }

    static int bytesToInt(byte[] bytes, int offset = 0)
    {
        int value = 0;
        for (int i = 0; i < 4; i++)
        {
            value |= ((int)bytes[offset + i]) << (i * 8);
        }
        return value;
    }
    // properties
    public float[] LeftChannel { get; internal set; }
    public float[] RightChannel { get; internal set; }
    public int ChannelCount { get; internal set; }
    public int SampleCount { get; internal set; }
    public int Frequency { get; internal set; }

    public WAV(byte[] wav)
    {

        // Determine if mono or stereo
        ChannelCount = wav[22];     // Forget byte 23 as 99.999% of WAVs are 1 or 2 channels

        // Get the frequency
        Frequency = bytesToInt(wav, 24);

        // Get past all the other sub chunks to get to the data subchunk:
        int pos = 12;   // First Subchunk ID from 12 to 16

        // Keep iterating until we find the data chunk (i.e. 64 61 74 61 ...... (i.e. 100 97 116 97 in decimal))
        while (!(wav[pos] == 100 && wav[pos + 1] == 97 && wav[pos + 2] == 116 && wav[pos + 3] == 97))
        {
            pos += 4;
            int chunkSize = wav[pos] + wav[pos + 1] * 256 + wav[pos + 2] * 65536 + wav[pos + 3] * 16777216;
            pos += 4 + chunkSize;
        }
        pos += 8;

        // Pos is now positioned to start of actual sound data.
        SampleCount = (wav.Length - pos) / 2;     // 2 bytes per sample (16 bit sound mono)
        if (ChannelCount == 2) SampleCount /= 2;        // 4 bytes per sample (16 bit stereo)

        // Allocate memory (right will be null if only mono sound)
        LeftChannel = new float[SampleCount];
        if (ChannelCount == 2) RightChannel = new float[SampleCount];
        else RightChannel = null;

        // Write to double array/s:
        int i = 0;
        int maxInput = wav.Length - (RightChannel == null ? 1 : 3);
        // while (pos < wav.Length)
        while ((i < SampleCount) && (pos < maxInput))
        {
            LeftChannel[i] = bytesToFloat(wav[pos], wav[pos + 1]);
            pos += 2;
            if (ChannelCount == 2)
            {
                RightChannel[i] = bytesToFloat(wav[pos], wav[pos + 1]);
                pos += 2;
            }
            i++;
        }
    }

    public override string ToString()
    {
        return string.Format("[WAV: LeftChannel={0}, RightChannel={1}, ChannelCount={2}, SampleCount={3}, Frequency={4}]", LeftChannel, RightChannel, ChannelCount, SampleCount, Frequency);
    }
}