ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required
I'm new to programming and machine learning, and I'm using code I found in a journal paper on spam detection. When I run it, it fails even though I have already prepared the data as described. The error message is 'ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.' Can anyone help me work out what is going wrong? [The link to the complete code is here](https://github.com/ijdutse/spd)
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from datetime import datetime
import preprocessor as p
import random, os, utils, smart_open, json, codecs, pickle, time
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.fftpack import fft
data_sources = ['I:\Data Penelitian\Iphone/iphone.json']
def main():
    spd = Spd(data_sources) #class instantiation
    start = time.process_time()
    relevant_tweets = spd.detector(data_sources)
    stop = time.process_time()
    return relevant_tweets
class Spd:
    """ some functions to accept raw files, extract relevant fields and filter our irrelevent content"""
    def __init__(self, data_sources):
        self.data_sources = data_sources
        pass

    # first function in the class:
    def extractor(self, data_sources): # accept list of files consisting of raw tweets in form of json object
        data_extracts = {'TweetID':[],'ScreenName':[],'RawTweets':[],'CreatedAt':[],'RetweetCount':[],\
                         'FollowersCount':[],'FriendsCount':[], 'StatusesCount':[],'FavouritesCount':[],\
                         'UserName':[],'Location':[],'AccountCreated':[],'Language':[],'Description':[],\
                         'UserURL':[],'VerifiedAccount':[],'CleanTweets':[],'UserID':[], 'TimeZone':[],'TweetFavouriteCount':[]}
        non_english_tweets = 0 # keep track of the non-English tweets
        with codecs.open('I:\Data Penelitian\Iphone/iphone.json', 'r') as f: # data_source is read from extractor() function
            for line in f.readlines():
                non_English = 0
                try:
                    line = json.loads(line)
                    if line['lang'] in ['en','en-gb','en-GB','en-AU','en-IN','en_US']:
                        data_extracts['Language'].append(line['Language'])
                        data_extracts['TweetID'].append(line['TweetID'])
                        data_extracts['RawTweets'].append(line['RawTweets'])
                        data_extracts['CleanTweets'].append(p.clean(line['RawTweets']))
                        data_extracts['CreatedAt'].append(line['CreatedAt'])
                        data_extracts['AccountCreated'].append(line['AccountCreated'])
                        data_extracts['ScreenName'].append(line['ScreenName'])
                        data_extracts['RetweetCount'].append(line['RetweetCount'])
                        data_extracts['FollowersCount'].append(line['FollowersCount'])
                        data_extracts['FriendsCount'].append(line['FriendsCount'])
                        data_extracts['StatusesCount'].append(line['StatusesCount'])
                        data_extracts['FavouritesCount'].append(line['FavouritesCount'])
                        data_extracts['UserName'].append(line['UserName'])
                        data_extracts['Location'].append(line['Location'])
                        data_extracts['Description'].append(line['Description'])
                        data_extracts['UserURL'].append(line['UserURL'])
                        data_extracts['VerifiedAccount'].append(line['VerifiedAccount'])
                        data_extracts['UserID'].append(line['UserID'])
                        data_extracts['TimeZone'].append(line['TimeZone'])
                        data_extracts['TweetFavouriteCount'].append(line['TweetFavouriteCount'])
                    else:
                        non_english_tweets +=1
                except:
                    continue
        df0 = pd.DataFrame(data_extracts) #convert data extracts to pandas DataFrame
        df0['CreatedAt']=pd.to_datetime(data_extracts['CreatedAt'],errors='coerce') # convert to datetime
        df0['AccountCreated']=pd.to_datetime(data_extracts['AccountCreated'],errors='coerce')
        df0 = df0.dropna(subset=['AccountCreated','CreatedAt']) # drop na in datetime
        AccountAge = [] # compute the account age of accounts
        date_format = "%Y-%m-%d %H:%M:%S"
        for dr,dc in zip(df0.CreatedAt, df0.AccountCreated):
            #try:
            dr = str(dr)
            dc = str(dc)
            d1 = datetime.strptime(dr,date_format)
            d2 = datetime.strptime(dc,date_format)
            dif = d1 - d2
            AccountAge.append(dif.days)
            #except:
            #continue
        df0['AccountAge']=AccountAge
        # add/define additional features ...
        df0['Retweets'] = df0.RawTweets.apply(lambda x: str(x).split()[0]=='RT' )
        df0['RawTweetsLen'] = df0.RawTweets.apply(lambda x: len(str(x))) # modified
        df0['DescriptionLen'] = df0.Description.apply(lambda x: len(str(x)))
        df0['UserNameLen'] = df0.UserName.apply(lambda x: len(str(x)))
        df0['ScreenNameLen'] = df0.ScreenName.apply(lambda x: len(str(x)))
        df0['LocationLen'] = df0.Location.apply(lambda x: len(str(x)))
        df0['Activeness'] = df0.StatusesCount.truediv(df0.AccountAge)
        df0['Friendship'] = df0.FriendsCount.truediv(df0.FollowersCount)
        df0['Followership'] = df0.FollowersCount.truediv(df0.FriendsCount)
        df0['Interestingness'] = df0.FavouritesCount.truediv(df0.StatusesCount)
        df0['BidirFriendship'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FriendsCount)
        df0['BidirFollowership'] = (df0.FriendsCount + df0.FollowersCount).truediv(df0.FollowersCount)
        df0['NamesRatio'] = df0.ScreenNameLen.truediv(df0.UserNameLen)
        df0['CleanTweetsLen'] = df0.CleanTweets.apply(lambda x: len(str(x)))
        df0['LexRichness'] = df0.CleanTweetsLen.truediv(df0.RawTweetsLen)
        # Remove all RTs, set UserID as index and save relevant files:
        df0 = df0[df0.Retweets.values==False] # remove retweets
        df0 = df0.set_index('UserID')
        df0 = df0[~df0.index.duplicated()] # remove duplicates in the tweet
        #df0.to_csv(data_source[:15]+'all_extracts.csv') #save all extracts as csv
        df0.to_csv(data_sources[:5]+'all_extracts.csv') #save all extracts as csv
        with open(data_sources[:5]+'non_English.txt','w') as d: # save count of non-English tweets
            d.write('{}'.format(non_english_tweets))
            d.close()
        return df0
    def detector(self, data_sources): # accept list of raw tweets as json objects
        self.data_sources = data_sources
        for data_sources in data_sources:
            self.data_sources = data_sources
            df0 = self.extractor(data_sources)
            #drop fields not required for predicition
            X = df0.drop(['Language','TweetID','RawTweets','CleanTweets','CreatedAt','AccountCreated','ScreenName',\
                          'Retweets','UserName','Location','Description','UserURL','VerifiedAccount','RetweetCount','TimeZone','TweetFavouriteCount'], axis=1)
            X = X.replace([np.inf,-np.inf],np.nan) # replace infinity values to avoid 0 division ...
            X = X.dropna()
            # reload the trained model for use:
            spd_filter=pickle.load(open('trained_rf.pkl','rb'))
            PredictedClass = spd_filter.predict(X) # Predict spam or automated accounts/tweets:
            X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
            nonspam = df0.loc[X.PredictedClass.values==1] # sort out the nonspam accounts
            spam = df0.loc[X.PredictedClass.values==0] # sort out spam/automated accounts
            #relevant_tweets = nonspam[['CreatedAt', 'CleanTweets']]
            relevant_tweets = nonspam[['CreatedAt','AccountCreated','ScreenName','Location','TimeZone','Description','VerifiedAccount','RawTweets', 'CleanTweets','TweetFavouriteCount','Retweets']]
            relevant_tweets = relevant_tweets.reset_index() # reset index and remove it from the dataframe
            #relevant_tweets = relevant_tweets.drop('UserID', axis=1)
            # save files:
            X.to_csv(data_source[:5]+'_all_predicted_classes.csv') #save all extracts as csv, used to be 15
            nonspam.to_csv(data_source[:5]+'_nonspam_accounts.csv')
            spam.to_csv(data_source[:5]+'_spam_accounts.csv')
            relevant_tweets.to_csv(data_source[:5]+'_relevant_tweets.csv') # relevant tweets for subsequent analysis
            return relevant_tweets # or return relevant_tweets, nonspam, spam

if __name__ =='__main__':
    main()
The traceback error is as follows:
ValueError Traceback (most recent call last)
<ipython-input-2-5dc56f49d005> in <module>
142
143 if __name__ =='__main__':
--> 144 main()
<ipython-input-2-5dc56f49d005> in main()
18 spd = Spd(data_sources) #class instantiation
19 start = time.process_time()
---> 20 relevant_tweets = spd.detector(data_sources)
21 stop = time.process_time()
22 return relevant_tweets
<ipython-input-2-5dc56f49d005> in detector(self, data_sources)
126 # reload the trained model for use:
127 spd_filter=pickle.load(open('trained_rf.pkl','rb'))
--> 128 PredictedClass = spd_filter.predict(X) # Predict spam or automated accounts/tweets:
129 X['PredictedClass'] = PredictedClass # include the predicted class in the dataframe
130 nonspam = df0.loc[X.PredictedClass.values==1] # sort out the nonspam accounts
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict(self, X)
543 The predicted classes.
544 """
--> 545 proba = self.predict_proba(X)
546
547 if self.n_outputs_ == 1:
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict_proba(self, X)
586 check_is_fitted(self, 'estimators_')
587 # Check data
--> 588 X = self._validate_X_predict(X)
589
590 # Assign chunk of trees to jobs
~\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in _validate_X_predict(self, X)
357 "call `fit` before exploiting the model.")
358
--> 359 return self.estimators_[0]._validate_X_predict(X, check_input=True)
360
361 @property
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in _validate_X_predict(self, X, check_input)
389 """Validate X whenever one tries to predict, apply, predict_proba"""
390 if check_input:
--> 391 X = check_array(X, dtype=DTYPE, accept_sparse="csr")
392 if issparse(X) and (X.indices.dtype != np.intc or
393 X.indptr.dtype != np.intc):
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
548 " minimum of %d is required%s."
549 % (n_samples, array.shape, ensure_min_samples,
--> 550 context))
551
552 if ensure_min_features > 0 and array.ndim == 2:
ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required.
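The (0, 19) in the message shows that the DataFrame X reaching spd_filter.predict() still has its 19 feature columns but zero rows, so every record is being filtered out somewhere between reading iphone.json and the final X.dropna(). A quick way to narrow this down is to count what survives each stage of extractor(). The sketch below is purely diagnostic and only reuses names already present in the code above (the hard-coded file path, the language list, and a few of the JSON keys the extractor looks up); it is not part of the pipeline:

import json
import codecs

# Mirrors the filters in extractor() and reports how many records survive each one.
PATH = r'I:\Data Penelitian\Iphone/iphone.json'   # same hard-coded path as in extractor()
LANGS = ['en', 'en-gb', 'en-GB', 'en-AU', 'en-IN', 'en_US']
# A few of the keys extractor() reads; a KeyError on any of them is swallowed by the bare except/continue.
EXPECTED_KEYS = ['Language', 'TweetID', 'RawTweets', 'CreatedAt', 'AccountCreated', 'UserID']

total = valid_json = english = with_keys = 0
with codecs.open(PATH, 'r') as f:
    for raw in f:
        total += 1
        try:
            tweet = json.loads(raw)
        except json.JSONDecodeError:
            continue
        if not isinstance(tweet, dict):
            continue
        valid_json += 1
        if tweet.get('lang') in LANGS:
            english += 1
            if all(key in tweet for key in EXPECTED_KEYS):
                with_keys += 1

print('lines in file:              ', total)
print('valid JSON objects:         ', valid_json)
print("English according to 'lang':", english)
print('with all expected fields:   ', with_keys)

If the last count is 0, the bare except/continue in extractor() is silently skipping every tweet (for example, a raw Twitter API export would not contain keys such as 'RawTweets' or 'TweetID'), which leaves df0 and therefore X empty and produces exactly the error above. If it is non-zero, the rows are being lost later, and printing len(df0) after dropna(subset=['AccountCreated','CreatedAt']), after the retweet filter, and len(X) after X.dropna() in detector() would show which step empties the frame. A simple guard such as "if X.empty: raise ValueError('no rows left to classify')" right before the predict call also makes the failure point much clearer than the sklearn message.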