This answer comes a year later, but anyway, for the record. I found three audio libraries with python bindings that extract features from audio. They are not that easy to install since they are really in C and you need to properly compile the python bindings and add them to the path to import, but here they are:
Python bindings are rich, though installing Echo Nest can be a pain, as the team does not seem to be able to build solid installers.
However, it does not do local processing. Instead, it calculates an audio fingerprint and uploads the song to Echo Nest's servers for information extraction, using algorithms they don't expose.
import wave, array, math, time, argparse, sys
import numpy, pywt
from scipy import signal
import pdb
import matplotlib.pyplot as plt
def read_wav(filename):
    """Read a .wav file and return ``(samples, sample_rate)``.

    Returns ``None`` if the file cannot be opened.  Samples are returned as
    a plain list of ints; for multi-channel files the channels are
    interleaved, which is why the frame-count check below may fire.
    """
    try:
        wf = wave.open(filename, 'rb')
    except OSError as e:  # Python 3 spelling of IOError
        print(e)
        return None
    nsamps = wf.getnframes()
    assert nsamps > 0
    fs = wf.getframerate()
    assert fs > 0
    # Pick the array typecode from the actual sample width.  The original
    # hard-coded 'i' (4 bytes), which mis-parses the common 16-bit case.
    # 8-bit WAV data is unsigned by spec, hence 'B'.
    typecode = {1: 'B', 2: 'h', 4: 'i'}[wf.getsampwidth()]
    samps = list(array.array(typecode, wf.readframes(nsamps)))
    wf.close()  # the original leaked the file handle
    if nsamps != len(samps):
        # e.g. stereo: len(samps) == nframes * nchannels
        print(nsamps, "not equal to", len(samps))
    return samps, fs
# print an error when no data can be found
def no_audio_data():
    """Log a skip message and return the ``(None, None)`` sentinel pair.

    Used by bpm_detector() to signal a silent or ambiguous window.
    """
    print("No audio data for sample, skipping...")
    return None, None
# simple peak detection
def peak_detect(data):
    """Locate the sample(s) of largest magnitude in *data*.

    Returns the tuple of index arrays produced by ``numpy.where`` for the
    entries equal to the extreme value.
    """
    extreme = numpy.amax(numpy.abs(data))
    indices = numpy.where(data == extreme)
    # No positive match means the extreme-magnitude sample is negative.
    if len(indices[0]) == 0:
        indices = numpy.where(data == -extreme)
    return indices
def bpm_detector(data, fs):
    """Estimate the tempo of one window of samples.

    Uses a 4-level discrete wavelet transform: the detail envelopes from
    every level are decimated to a common rate, summed, autocorrelated, and
    the strongest ACF lag inside the 40-220 BPM band is converted to BPM.

    Returns ``(bpm, correl)`` or ``(None, None)`` (via no_audio_data) when
    the window is silent or the peak is ambiguous.
    """
    levels = 4
    max_decimation = 2 ** (levels - 1)
    # ACF search window: 220 BPM and 40 BPM expressed as lags (in samples
    # at the decimated rate).  int() because Py3 slices reject floats.
    min_ndx = int(60.0 / 220 * (fs / max_decimation))
    max_ndx = int(60.0 / 40 * (fs / max_decimation))

    cA = []
    cD_sum = []
    cD_minlen = 0
    for loop in range(levels):
        # 1) DWT: cA = approximation (low band), cD = detail (high band).
        if loop == 0:
            cA, cD = pywt.dwt(data, 'db4')
            cD_minlen = len(cD) // max_decimation + 1
            cD_sum = numpy.zeros(cD_minlen)
        else:
            cA, cD = pywt.dwt(cA, 'db4')
        # 2) One-pole low-pass.  NOTE: the original passed [1 -0.99],
        # which Python evaluates to the single value 0.01 and turns the
        # filter into an identity; [1, -0.99] is the intended denominator.
        cD = signal.lfilter([0.01], [1, -0.99], cD)
        # 3) Decimate so every level lands at the coarsest rate, then
        #    full-wave rectify and remove the mean.
        cD = numpy.abs(cD[:: 2 ** (levels - loop - 1)])
        cD = cD - numpy.mean(cD)
        # 4) Accumulate the detail envelopes across levels.
        cD_sum = cD[0:cD_minlen] + cD_sum

    if not numpy.any(cA):
        # All-zero approximation band: the window is silent.
        return no_audio_data()

    # Fold in the (filtered, rectified, demeaned) approximation band too.
    cA = signal.lfilter([0.01], [1, -0.99], cA)
    cA = numpy.abs(cA)
    cA = cA - numpy.mean(cA)
    cD_sum = cA[0:cD_minlen] + cD_sum

    # Autocorrelate; only non-negative lags (second half) are meaningful.
    correl = numpy.correlate(cD_sum, cD_sum, 'full')
    midpoint = len(correl) // 2
    correl_midpoint_tmp = correl[midpoint:]
    peak_ndx = peak_detect(correl_midpoint_tmp[min_ndx:max_ndx])
    # peak_ndx is a numpy.where tuple; the original tested len(peak_ndx),
    # which is always 1 -- the intended check is on the index array.
    if len(peak_ndx[0]) > 1:
        return no_audio_data()
    peak_ndx_adjusted = peak_ndx[0] + min_ndx
    bpm = 60.0 / peak_ndx_adjusted * (fs / max_decimation)
    print(bpm)
    return bpm, correl
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Process .wav file to determine the Beats Per Minute.')
    parser.add_argument('--filename', required=True,
                        help='.wav file for processing')
    parser.add_argument('--window', type=float, default=3,
                        help='size of the window (seconds) that will be scanned to determine the bpm. Typically less than 10 seconds. [3]')
    args = parser.parse_args()

    # read_wav() returns None (not a tuple) on failure; guard before unpacking.
    wav = read_wav(args.filename)
    if wav is None:
        sys.exit(1)
    samps, fs = wav

    nsamps = len(samps)
    window_samps = int(args.window * fs)
    max_window_ndx = nsamps // window_samps  # whole windows only (Py3: // not /)
    bpms = numpy.zeros(max_window_ndx)
    correl = []
    samps_ndx = 0  # index of the first sample of the current window

    for window_ndx in range(max_window_ndx):
        data = samps[samps_ndx:samps_ndx + window_samps]
        if len(data) != window_samps:
            raise AssertionError(str(len(data)))
        # Always advance to the next window: the original only advanced on
        # success, so after one silent window every later window re-read
        # the same samples.
        samps_ndx += window_samps
        bpm, correl_temp = bpm_detector(data, fs)
        if bpm is None:
            continue  # silent/ambiguous window: its bpms slot stays 0
        bpms[window_ndx] = bpm
        correl = correl_temp

    print('Completed. Estimated Beats Per Minute:', numpy.median(bpms))

    # Show the last window's autocorrelation without blocking, then close.
    plt.plot(range(len(correl)), numpy.abs(correl))
    plt.show(block=False)
    time.sleep(10)
    plt.close()
I've found this code by @scaperot that could help you:
import wave, array, math, time, argparse, sys
import numpy, pywt
from scipy import signal
import pdb
import matplotlib.pyplot as plt
def read_wav(filename):
    """Read a .wav file and return ``(samples, sample_rate)``.

    Returns ``None`` if the file cannot be opened.  Samples are returned as
    a plain list of ints; for multi-channel files the channels are
    interleaved, which is why the frame-count check below may fire.
    """
    try:
        wf = wave.open(filename, 'rb')
    except OSError as e:  # Python 3 spelling of IOError
        print(e)
        return None
    nsamps = wf.getnframes()
    assert nsamps > 0
    fs = wf.getframerate()
    assert fs > 0
    # Pick the array typecode from the actual sample width.  The original
    # hard-coded 'i' (4 bytes), which mis-parses the common 16-bit case.
    # 8-bit WAV data is unsigned by spec, hence 'B'.
    typecode = {1: 'B', 2: 'h', 4: 'i'}[wf.getsampwidth()]
    samps = list(array.array(typecode, wf.readframes(nsamps)))
    wf.close()  # the original leaked the file handle
    if nsamps != len(samps):
        # e.g. stereo: len(samps) == nframes * nchannels
        print(nsamps, "not equal to", len(samps))
    return samps, fs
# print an error when no data can be found
def no_audio_data():
    """Log a skip message and return the ``(None, None)`` sentinel pair.

    Used by bpm_detector() to signal a silent or ambiguous window.
    """
    print("No audio data for sample, skipping...")
    return None, None
# simple peak detection
def peak_detect(data):
    """Locate the sample(s) of largest magnitude in *data*.

    Returns the tuple of index arrays produced by ``numpy.where`` for the
    entries equal to the extreme value.
    """
    extreme = numpy.amax(numpy.abs(data))
    indices = numpy.where(data == extreme)
    # No positive match means the extreme-magnitude sample is negative.
    if len(indices[0]) == 0:
        indices = numpy.where(data == -extreme)
    return indices
def bpm_detector(data, fs):
    """Estimate the tempo of one window of samples.

    Uses a 4-level discrete wavelet transform: the detail envelopes from
    every level are decimated to a common rate, summed, autocorrelated, and
    the strongest ACF lag inside the 40-220 BPM band is converted to BPM.

    Returns ``(bpm, correl)`` or ``(None, None)`` (via no_audio_data) when
    the window is silent or the peak is ambiguous.
    """
    levels = 4
    max_decimation = 2 ** (levels - 1)
    # ACF search window: 220 BPM and 40 BPM expressed as lags (in samples
    # at the decimated rate).  int() because Py3 slices reject floats.
    min_ndx = int(60.0 / 220 * (fs / max_decimation))
    max_ndx = int(60.0 / 40 * (fs / max_decimation))

    cA = []
    cD_sum = []
    cD_minlen = 0
    for loop in range(levels):
        # 1) DWT: cA = approximation (low band), cD = detail (high band).
        if loop == 0:
            cA, cD = pywt.dwt(data, 'db4')
            cD_minlen = len(cD) // max_decimation + 1
            cD_sum = numpy.zeros(cD_minlen)
        else:
            cA, cD = pywt.dwt(cA, 'db4')
        # 2) One-pole low-pass.  NOTE: the original passed [1 -0.99],
        # which Python evaluates to the single value 0.01 and turns the
        # filter into an identity; [1, -0.99] is the intended denominator.
        cD = signal.lfilter([0.01], [1, -0.99], cD)
        # 3) Decimate so every level lands at the coarsest rate, then
        #    full-wave rectify and remove the mean.
        cD = numpy.abs(cD[:: 2 ** (levels - loop - 1)])
        cD = cD - numpy.mean(cD)
        # 4) Accumulate the detail envelopes across levels.
        cD_sum = cD[0:cD_minlen] + cD_sum

    if not numpy.any(cA):
        # All-zero approximation band: the window is silent.
        return no_audio_data()

    # Fold in the (filtered, rectified, demeaned) approximation band too.
    cA = signal.lfilter([0.01], [1, -0.99], cA)
    cA = numpy.abs(cA)
    cA = cA - numpy.mean(cA)
    cD_sum = cA[0:cD_minlen] + cD_sum

    # Autocorrelate; only non-negative lags (second half) are meaningful.
    correl = numpy.correlate(cD_sum, cD_sum, 'full')
    midpoint = len(correl) // 2
    correl_midpoint_tmp = correl[midpoint:]
    peak_ndx = peak_detect(correl_midpoint_tmp[min_ndx:max_ndx])
    # peak_ndx is a numpy.where tuple; the original tested len(peak_ndx),
    # which is always 1 -- the intended check is on the index array.
    if len(peak_ndx[0]) > 1:
        return no_audio_data()
    peak_ndx_adjusted = peak_ndx[0] + min_ndx
    bpm = 60.0 / peak_ndx_adjusted * (fs / max_decimation)
    print(bpm)
    return bpm, correl
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Process .wav file to determine the Beats Per Minute.')
    parser.add_argument('--filename', required=True,
                        help='.wav file for processing')
    parser.add_argument('--window', type=float, default=3,
                        help='size of the window (seconds) that will be scanned to determine the bpm. Typically less than 10 seconds. [3]')
    args = parser.parse_args()

    # read_wav() returns None (not a tuple) on failure; guard before unpacking.
    wav = read_wav(args.filename)
    if wav is None:
        sys.exit(1)
    samps, fs = wav

    nsamps = len(samps)
    window_samps = int(args.window * fs)
    max_window_ndx = nsamps // window_samps  # whole windows only (Py3: // not /)
    bpms = numpy.zeros(max_window_ndx)
    correl = []
    samps_ndx = 0  # index of the first sample of the current window

    for window_ndx in range(max_window_ndx):
        data = samps[samps_ndx:samps_ndx + window_samps]
        if len(data) != window_samps:
            raise AssertionError(str(len(data)))
        # Always advance to the next window: the original only advanced on
        # success, so after one silent window every later window re-read
        # the same samples.
        samps_ndx += window_samps
        bpm, correl_temp = bpm_detector(data, fs)
        if bpm is None:
            continue  # silent/ambiguous window: its bpms slot stays 0
        bpms[window_ndx] = bpm
        correl = correl_temp

    print('Completed. Estimated Beats Per Minute:', numpy.median(bpms))

    # Show the last window's autocorrelation without blocking, then close.
    plt.plot(range(len(correl)), numpy.abs(correl))
    plt.show(block=False)
    time.sleep(10)
    plt.close()
Librosa has the librosa.beat.beat_track() method, but you need to supply an estimate of the BPM as the "start_bpm" parameter. Not sure how accurate it is, but perhaps worth a shot.
librosa is the package you are looking for. It contains an extensive range of functions for audio analysis. The librosa.beat.beat_track() and librosa.beat.tempo() functions will extract the required features for you.
Spectral features like chroma, MFCC, Zero-crossing rate, and rhythm features such as tempogram can also be obtained using the functions available in librosa.
Well, I recently came across Vampy, which is a wrapper plugin that enables you to use Vamp plugins written in Python in any Vamp host. Vamp is an audio processing plugin system for plugins that extract descriptive information from audio data. Hope it helps.
发布评论
评论(6)
这个答案是一年后给出的,但无论如何,请记录在案。我发现了三个带有 python 绑定的音频库,可以从音频中提取特征。它们安装起来并不容易,因为它们实际上是用 C 语言编写的,您需要正确编译 python 绑定并将它们添加到要导入的路径中,但它们在这里:
This answer comes a year later, but anyway, for the record. I found three audio libraries with python bindings that extract features from audio. They are not that easy to install since they are really in C and you need to properly compile the python bindings and add them to the path to import, but here they are:
Echo Nest API 正是您所寻找的:
http://echonest.github.io/remix/
Python 绑定很丰富,但安装 Echo Nest 可能会很痛苦,因为团队似乎无法构建可靠的安装程序。
但是它不进行本地处理。相反,它计算音频指纹并上传歌曲给 Echo Nest 服务器,以便使用它们不公开的算法提取信息。
Echo Nest API is what you are looking for:
http://echonest.github.io/remix/
Python bindings are rich, though installing Echo Nest can be a pain, as the team does not seem to be able to build solid installers.
However, it does not do local processing. Instead, it calculates an audio fingerprint and uploads the song to Echo Nest's servers for information extraction, using algorithms they don't expose.
我在@scaperot 这里找到了这段代码 这可以帮助你:
I've found this code by @scaperot that could help you:
Librosa 有 librosa.beat.beat_track() 方法,但您需要提供 BMP 的估计值作为“start_bpm”参数。不确定它有多准确,但也许值得一试。
Librosa has the librosa.beat.beat_track() method, but you need to supply an estimate of the BPM as the "start_bpm" parameter. Not sure how accurate it is, but perhaps worth a shot.
librosa
是您正在寻找的包。它包含广泛的音频分析功能。librosa.beat.beat_track()
和librosa.beat.tempo()
函数将为您提取所需的功能。还可以使用 librosa 中提供的函数来获取色度、MFCC、过零率等光谱特征和节奏图等节奏特征。
librosa is the package you are looking for. It contains an extensive range of functions for audio analysis. The librosa.beat.beat_track() and librosa.beat.tempo() functions will extract the required features for you. Spectral features like chroma, MFCC, and zero-crossing rate, and rhythm features such as the tempogram, can also be obtained using the functions available in librosa.
好吧，我最近遇到了 Vampy 这是一个包装插件，使您能够使用用以下语言编写的 Vamp 插件任何 Vamp 主机中的 Python。 Vamp 是一个音频处理插件系统，用于从音频数据中提取描述性信息的插件。希望有帮助。
Well, I recently came across Vampy, which is a wrapper plugin that enables you to use Vamp plugins written in Python in any Vamp host. Vamp is an audio processing plugin system for plugins that extract descriptive information from audio data. Hope it helps.