【工具】分割音频并进行FFT
下面给出分割音频的完整代码(随后是 FFT 与语谱图部分)。
分割音频
import wave
import os
from pydub import AudioSegment
from pydub.silence import split_on_silence
# --- Split one wav file into fixed-length chunks under <dir>/cuts/ ---
audiopath = "G:/python_test/mic/20210201_120146.wav"
audiotype = 'wav'  # 如果wav、mp4其他格式参看pydub.AudioSegment的API
cut_time = 1  # chunk length in seconds

# 读入音频
print('读入音频')
sound = AudioSegment.from_file(audiopath, format=audiotype)
# sound = sound[:3*60*1000]  # for large files, test on the first 3 minutes first
duration = sound.duration_seconds * 1000  # total length in ms
print(sound.duration_seconds)
num = sound.duration_seconds / cut_time  # how many chunks (may be fractional)
file_num = int(num)
print('wav file num = ', num, 'loss 1')

filepath = os.path.split(audiopath)[0]  # directory of the input file
cuts_path = filepath + '/cuts/'
print(cuts_path)
os.makedirs(cuts_path, exist_ok=True)  # idiomatic: no isdir() pre-check needed

for i in range(file_num + 1):  # +1 because the duration may not divide evenly
    begin = cut_time * i * 1000
    # Bug fix: the old clamp (`end/1000 > file_num`) only worked when
    # cut_time == 1; clamp the last chunk to the real duration instead.
    end = min(begin + cut_time * 1000, duration)
    if begin >= end:
        # Duration was an exact multiple of cut_time — the old code exported
        # an empty wav here; skip it.
        continue
    print('begin=', begin, 'end=', end)
    # Zero-pad to 3 digits so filenames sort correctly past i = 99
    # (old scheme only padded i < 10).
    cut_file_path = cuts_path + 'cut' + str(i).zfill(3) + '.wav'
    cut_wav = sound[begin:end]  # slice is in milliseconds
    cut_wav.export(cut_file_path, format='wav')
print('保存完毕')
FFT
import os
import csv
import numpy as np
import json
import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt
def get_front_2_last(line, front, last):
    """Return the substring of *line* between marker *front* and marker *last*.

    The slice starts right after the first occurrence of ``front`` and stops
    at the first occurrence of ``last`` in the remaining text.
    """
    start = line.find(front) + len(front)
    tail = line[start:]
    return tail[:tail.find(last)]
def fft_pict(file_name):
    """Read a wav chunk, compute its FFT magnitude and save the plot.

    The chunk index is parsed out of the filename (text between "/cut" and
    ".wav") and used both in the x-axis label and the output png name
    (fft_pict/<num>.png).
    """
    num = get_front_2_last(file_name, "/cut", ".wav")
    print(num)
    # NOTE(review): the label says kHz but the x axis below is raw FFT bin
    # indices, not frequency — confirm intended units.
    xlab = 'Freq (kHz)--' + num
    save_file_name = "fft_pict/" + num + ".png"
    sampling_freq, audio = wavfile.read(file_name)  # 读取文件
    # Bug fix: normalize by the absolute peak. The old `np.max(audio)` is
    # wrong when the largest excursion is negative, and divides by zero on
    # an all-zero (silent) chunk.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio = audio / peak
    # FFT magnitude (complex spectrum -> absolute value).
    fft_signal = np.abs(np.fft.fft(audio))
    # x axis: one point per FFT bin.
    Freq = np.arange(0, len(fft_signal))
    plt.plot(Freq, fft_signal, color='blue')
    plt.xlabel(xlab)
    plt.xlim(0, 2000)  # show only the first 2000 bins
    plt.ylabel('Amplitude')
    plt.savefig(save_file_name)
    plt.clf()  # clear the shared figure (created once, outside this function)
resultdir = "cuts"
# One shared figure for all files; creating a figure inside fft_pict was very
# slow and triggered "too many open figures" warnings.
plt.figure()
for dirpath, _dirs, filenames in os.walk(resultdir):
    for fname in filenames:
        print(fname)
        print(dirpath)
        # Join with '/' on purpose: fft_pict() parses the "/cut" marker out
        # of this exact string.
        fft_pict(dirpath + '/' + fname)
plt.close("all")
打开过多窗口解决
我使用的是只创建一个窗口,然后每次打开文件后清除内容。
之前把创建窗口命令放在fft_pict函数里,导致每次处理需要创建窗口,速度特别慢。
求语谱图
import numpy as np
import matplotlib.pyplot as plt
import os
import wave
# #读入音频。
# path = "E:\SpeechWarehouse\zmkm"
# name = 'zmkm0.wav'
# #我音频的路径为E:\SpeechWarehouse\zmkm\zmkm0.wav
# filename = os.path.join(path, name)
# 打开语音文件。
# --- Read the wav file into a (channels x frames) float array ---
f = wave.open("20210201_120146.wav")
# Audio parameters; nframes is the number of sample frames in the file.
params = f.getparams()
nchannels, sampwidth, framerate, nframes = params[:4]
print("reading wav file......")
strData = f.readframes(nframes)
f.close()  # close as soon as the raw bytes are read
print("file is closed!")
# NOTE(review): np.short assumes 16-bit samples (sampwidth == 2) — confirm.
waveData = np.frombuffer(strData, dtype=np.short)
# Normalize to [-1, 1]. Fix: use vectorized np.max(np.abs(...)) — the old
# builtin max(abs(waveData)) iterated the array element-by-element in Python.
waveData = waveData * 1.0 / np.max(np.abs(waveData))
# Interleaved frames -> one row per channel (nchannels x nframes).
waveData = np.reshape(waveData, [nframes, nchannels]).T
# #----------------------------------------------------------------#
# '''绘制语音波形'''
# print("plotting signal wave...")
# time = np.arange(0,nframes) * (1.0 / framerate)#计算时间
# time= np.reshape(time,[nframes,1]).T
# plt.plot(time[0,:nframes],waveData[0,:nframes],c="b")
# plt.xlabel("time")
# plt.ylabel("amplitude")
# plt.title("Original wave")
# plt.show()
#--------------------------------------------------------------#
'''
Spectrogram setup:
1. Pick frame length and overlap; the FFT size equals the points per frame
   (no zero padding), so snap the frame size to a power of two.
2. The spectrogram itself is drawn further below.
'''
print("plotting spectrogram...")
framelength = 0.025  # frame length in seconds; 20–30 ms is typical
framesize = framelength * framerate  # points per frame: N = t * fs
# Snap to the nearest power of two (256/512 are the usual choices).
# min(..., key=...) replaces the old dict + sorted() construction; ties
# resolve to the first candidate, exactly like the stable sort did.
framesize = min([32, 64, 128, 256, 512, 1024], key=lambda n: abs(framesize - n))
NFFT = framesize  # NFFT must equal the frame size (no zero padding)
# Overlap of about 1/3–1/2 of a frame; round to an integer sample count.
overlapSize = int(round(1.0 / 3 * framesize))
print("帧长为{},帧叠为{},傅里叶变换点数为{}".format(framesize, overlapSize, NFFT))
print("NFFT", NFFT)
print("overlapSize", overlapSize)
#----------------------------------------------------------------#
'''绘制语音波形'''
print("plotting signal wave...")
time = np.arange(0,nframes) * (1.0 / framerate)# time of each sample, in seconds
time_len=nframes * (1.0 / framerate)  # total clip length in seconds
time= np.reshape(time,[nframes,1]).T  # .T transposes to a row vector
plt.subplot(211)  # 2 rows x 1 column, top panel
plt.plot(time[0,:nframes],waveData[0,:nframes],c="b")  # channel 0 waveform; c="b" draws in blue
plt.xlim(0,time_len)  # limit x axis to 0..clip length (seconds)
plt.xlabel("time")
plt.ylabel("amplitude")
plt.title("Original wave")
plt.subplot(212)  # 2 rows x 1 column, bottom panel
spectrum,freqs,ts,fig = plt.specgram(waveData[0],NFFT = NFFT,Fs =framerate,window=np.hanning(M = framesize),noverlap=overlapSize,mode='default',scale_by_freq=True,sides='default',scale='dB',xextent=None)# draw the spectrogram (Hanning window, dB scale)
plt.ylim(0,2000)  # limit displayed frequency band to 0–2000 Hz
plt.ylabel('Frequency')
plt.xlabel('Time')
plt.title("Spectrogram")
plt.tight_layout()  # adjust spacing between the subplots
plt.savefig('all.png') # save the pict
plt.show()
效果图
FFT
语谱图
更多推荐
已为社区贡献6条内容
所有评论(0)