ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required
I'm new to programming and machine learning, and I'm using code I found in a journal paper on spam detection. When I run it, it fails even though I have already prepared the data as described. The error message is 'ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.' Can anyone help me work out what is going wrong? [The link to the complete code is here](https://github.com/ijdutse/spd)
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from datetime import datetime
import preprocessor as p
import random, os, utils, smart_open, json, codecs, pickle, time
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.fftpack import fft
data_sources = ['I:\Data Penelitian\Iphone/iphone.json']
def main():
    spd = Spd(data_sources) #class instantiation
    start = time.process_time()
    relevant_tweets = spd.detector(data_sources)
    stop = time.process_time()
    return relevant_tweets
class Spd:
    """ some functions to accept raw files, extract relevant fields and filter our irrelevent content"""
    def __init__(self, data_sources):
        self.data_sources = data_sources
        pass

    # first function in the class:
    def extractor(self, data_sources): # accept list of files consisting of raw tweets in form of json object
        data_extracts = {'TweetID':[],'ScreenName':[],'RawTweets':[],'CreatedAt':[],'RetweetCount':[],\
                         'FollowersCount':[],'FriendsCount':[], 'StatusesCount':[],'FavouritesCount':[],\
                         'UserName':[],'Location':[],'AccountCreated':[],'Language':[],'Description':[],\
                         'UserURL':[],'VerifiedAccount':[],'CleanTweets':[],'UserID':[], 'TimeZone':[],'TweetFavouriteCount':[]}
        non_english_tweets = 0 # keep track of the non-English tweets
        with codecs.open('I:\Data Penelitian\Iphone/iphone.json', 'r') as f: # data_source is read from extractor() function
            for line in f.readlines():
                non_English = 0
                try:
                    line = json.loads(line)
                    if line['lang'] in ['en','en-gb','en-GB','en-AU','en-IN','en_US']:
                        data_extracts['Language'].append(line['Language'])
                        data_extracts['TweetID'].append(line['TweetID'])
                        data_extracts['RawTweets'].append(line['RawTweets'])
                        data_extracts['CleanTweets'].append(p.clean(line['RawTweets']))
                        data_extracts['CreatedAt'].append(line['CreatedAt'])
                        data_extracts['AccountCreated'].append(line['AccountCreated'])
                        data_extracts['ScreenName'].append(line['ScreenName'])
                        data_extracts['RetweetCount'].append(line['RetweetCount'])
                        data_extracts['FollowersCount'].append(line['FollowersCount'])
                        data_extracts['FriendsCount'].append(line['FriendsCount'])
                        data_extracts['StatusesCount'].append(line['StatusesCount'])
                        data_extracts['FavouritesCount'].append(line['FavouritesCount'])
                        data_extracts['UserName'].append(line['UserName'])
                        data_extracts['Location'].append(line['Location'])
                        data_extracts['Description'].append(line['Description'])
                        data_extracts['UserURL'].append(line['UserURL'])
                        data_extracts['VerifiedAccount'].append(line['VerifiedAccount'])
                        data_extracts['UserID'].append(line['UserID'])
                        data_extracts['TimeZone'].append(line['TimeZone'])
                        data_extracts['TweetFavouriteCount'].append(line['TweetFavouriteCount'])
                    else:
                        non_english_tweets +=1
                except:
                    continue
        df0 = pd.DataFrame(data_extracts) #convert data extracts to pandas DataFrame
        df0['CreatedAt']=pd.to_datetime(data_extracts['CreatedAt'],errors='coerce') # convert to datetime
        df0['AccountCreated']=pd.to_datetime(data_extracts['AccountCreated'],errors='coerce')
        df0 = df0.dropna(subset=['AccountCreated','CreatedAt']) # drop na in datetime
        AccountAge = [] # compute the account age of accounts
        date_format = "%Y-%m-%d %H:%M:%S"
        for dr,dc in zip(df0.CreatedAt, df0.AccountCreated):
            #try:
            dr = str(dr)
            dc = str(dc)
            d1 = datetime.strptime(dr,date_format)
            d2 = datetime.strptime(dc,date_format)
            dif = d1 - d2
            AccountAge.append(dif.days)
            #except:
            #continue
        df0['AccountAge']=AccountAge
        # add/define additional features ...
        df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[0]=='RT' )
        df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x))) # modified
        df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
        df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
        df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
        df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
        df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
        df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
        df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
        df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
        df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
        df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
        df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
        df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
        df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)
        # Remove all RTs, set UserID as index and save relevant files:
        df0 = df0[df0.Retweets.values==False] # remove retweets
        df0 = df0.set_index('UserID')
        df0 = df0[~df0.index.duplicated()] # remove duplicates in the tweet
        #df0.to_csv(data_source[:15]+'all_extracts.csv') #save all extracts as csv
        df0.to_csv(data_sources[:5]+'all_extracts.csv') #save all extracts as csv
        with open(data_sources[:5]+'non_English.txt','w') as d: # save count of non-English tweets
            d.write('{}'.format(non_english_tweets))
            d.close()
        return df0
    def detector(self, data_sources): # accept list of raw tweets as json objects
        self.data_sources = data_sources
        for data_sources in data_sources:
            self.data_sources = data_sources
            df0 = self.extractor(data_sources)
            #drop fields not required for predicition
            X = df0.drop(['Language','TweetID','RawTweets','CleanTweets','CreatedAt','AccountCreated','ScreenName',\
                          'Retweets','UserName','Location','Description','UserURL','VerifiedAccount','RetweetCount','TimeZone','TweetFavouriteCount'], axis=1)
            X = X.replace([np.inf,-np.inf],np.nan) # replace infinity values to avoid 0 division ...
            X = X.dropna()
            # reload the trained model for use:
            spd_filter=pickle.load(open('trained_rf.pkl','rb'))
            PredictedClass = spd_filter.predict(X) # Predict spam or automated accounts/tweets:
            X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
            nonspam = df0.loc[X.PredictedClass.values==1] # sort out the nonspam accounts
            spam = df0.loc[X.PredictedClass.values==0] # sort out spam/automated accounts
            #relevant_tweets = nonspam[['CreatedAt', 'CleanTweets']]
            relevant_tweets = nonspam[['CreatedAt','AccountCreated','ScreenName','Location','TimeZone','Description','VerifiedAccount','RawTweets', 'CleanTweets','TweetFavouriteCount','Retweets']]
            relevant_tweets = relevant_tweets.reset_index() # reset index and remove it from the dataframe
            #relevant_tweets = relevant_tweets.drop('UserID', axis=1)
            # save files:
            X.to_csv(data_source[:5]+'_all_predicted_classes.csv') #save all extracts as csv, used to be 15
            nonspam.to_csv(data_source[:5]+'_nonspam_accounts.csv')
            spam.to_csv(data_source[:5]+'_spam_accounts.csv')
            relevant_tweets.to_csv(data_source[:5]+'_relevant_tweets.csv') # relevant tweets for subsequent analysis
            return relevant_tweets # or return relevant_tweets, nonspam, spam

if __name__ =='__main__':
    main()
The traceback error is as follows:
ValueError Traceback (most recent call last)
<ipython-input-2-5dc56f49d005> in <module>
142
143 if __name__ =='__main__':
--> 144 main()
<ipython-input-2-5dc56f49d005> in main()
18 spd = Spd(data_sources) #class instantiation
19 start = time.process_time()
---> 20 relevant_tweets = spd.detector(data_sources)
21 stop = time.process_time()
22 return relevant_tweets
<ipython-input-2-5dc56f49d005> in detector(self, data_sources)
126 # reload the trained model for use:
127 spd_filter=pickle.load(open('trained_rf.pkl','rb'))
--> 128 PredictedClass = spd_filter.predict(X) # Predict spam or automated accounts/tweets:
129 X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
130 nonspam = df0.loc[X.PredictedClass.values==1] # sort out the nonspam accounts
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict(self, X)
543 The predicted classes.
544 """
--> 545 proba = self.predict_proba(X)
546
547 if self.n_outputs_ == 1:
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict_proba(self, X)
586 check_is_fitted(self, 'estimators_')
587 # Check data
--> 588 X = self._validate_X_predict(X)
589
590 # Assign chunk of trees to jobs
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in _validate_X_predict(self, X)
357 "call `fit` before exploiting the model.")
358
--> 359 return self.estimators_[0]._validate_X_predict(X, check_input=True)
360
361 @property
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in _validate_X_predict(self, X, check_input)
389 """Validate X whenever one tries to predict, apply, predict_proba"""
390 if check_input:
--> 391 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
392 if issparse(X) and (X.indices.dtype != np.intc or
393 X.indptr.dtype != np.intc):
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
548 " minimum of %d is required%s."
549 % (n_samples, array.shape, ensure_min_samples,
--> 550 context))
551
552 if ensure_min_features > 0 and array.ndim == 2:
ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.
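The (0, 19) in the message shows that the DataFrame X reaching spd_filter.predict() still has its 19 feature columns but zero rows, so every record is being filtered out somewhere between reading iphone.json and the final X.dropna(). A quick way to narrow this down is to count what survives each stage of extractor(). The sketch below is purely diagnostic and only reuses names already present in the code above (the hard-coded file path, the language list, and a few of the JSON keys the extractor looks up); it is not part of the pipeline:

import json
import codecs

# Mirrors the filters in extractor() and reports how many records survive each one.
PATH = r'I:\Data Penelitian\Iphone/iphone.json'   # same hard-coded path as in extractor()
LANGS = ['en', 'en-gb', 'en-GB', 'en-AU', 'en-IN', 'en_US']
# A few of the keys extractor() reads; a KeyError on any of them is swallowed by the bare except/continue.
EXPECTED_KEYS = ['Language', 'TweetID', 'RawTweets', 'CreatedAt', 'AccountCreated', 'UserID']

total = valid_json = english = with_keys = 0
with codecs.open(PATH, 'r') as f:
    for raw in f:
        total += 1
        try:
            tweet = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(tweet, dict):
            continue
        valid_json += 1
        if tweet.get('lang') in LANGS:
            english += 1
            if all(key in tweet for key in EXPECTED_KEYS):
                with_keys += 1

print('lines in file:              ', total)
print('valid JSON objects:         ', valid_json)
print("English according to 'lang':", english)
print('with all expected fields:   ', with_keys)

If the last count is 0, the bare except/continue in extractor() is silently skipping every tweet (for example, a raw Twitter API export would not contain keys such as 'RawTweets' or 'TweetID'), which leaves df0 and therefore X empty and produces exactly the error above. If it is non-zero, the rows are being lost later, and printing len(df0) after dropna(subset=['AccountCreated','CreatedAt']), after the retweet filter, and len(X) after X.dropna() in detector() would show which step empties the frame. A simple guard such as "if X.empty: raise ValueError('no rows left to classify')" right before the predict call also makes the failure point much clearer than the sklearn message.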