Windows 上尝试 python 多处理编码时出现运行时错误

发布于 2025-01-10 11:38:14 字数 2843 浏览 0 评论 0原文

我正在尝试运行 python 多处理库来加速 csv 文件的编码。但是我遇到了这个错误：

RuntimeError: 
        An attempt has been made to start a new process before the
        current process has finished its bootstrapping phase.

        This probably means that you are not using fork to start your
        child processes and you have forgotten to use the proper idiom
        in the main module:

            if __name__ == '__main__':
                freeze_support()
                ...

        The "freeze_support()" line can be omitted if the program
        is not going to be frozen to produce an executable.

我确实创建了

Search = SemanticSearch(model_path, data_path, query)
if __name__ == '__main__':

    query, flat, top_results = Search.search()

That 指向我的类中的函数，

def setup(self):
        with open(self.data_path, newline='') as f:  # read and sort data
            reader = csv.reader(f)
            data1 = list(reader)
        self.corpus = [x for sublist in data1 for x in sublist]  # turn into 1D list
        #SemanticSearch.encode(self)
        self.texts_encodings = self.map(self.encode, self.corpus)
        end = time.time()
        print(end - self.start)


    def encode(self):
        self.start = time.time()
        return self.model.encode(self.corpus, convert_to_tensor=True, show_progress_bar=True)

在我的 init 函数中我确实调用并设置了

self.map = Pool().map

我有任何提示丢失的？提前致谢

编辑

class SemanticSearch(object):
   def __init__(self, model, data, query):
       self.query = query
       self.model = SentenceTransformer(model)  ### Model location
       self.data_path = data  ###path to csv 
       self.corpus = None
       self.texts_encodings = None
       self.start = None
       self.map = Pool().map

   def setup(self):
       print('here')
       with open(self.data_path, newline='') as f:  # read and sort data
           reader = csv.reader(f)
           data1 = list(reader)
       self.corpus = [x for sublist in data1 for x in sublist]  # turn into 1D list
       # SemanticSearch.encode(self)
       self.texts_encodings = self.map(self.encode, self.corpus)
       # SemanticSearch.encode(self)
       end = time.time()
       print(end - self.start)

   def encode(self):
       self.start = time.time()
       return self.model.encode(self.corpus, convert_to_tensor=True,
                                show_progress_bar=True)  ##encode to invisible layer

   def search(self):
       SemanticSearch.setup(self)


if __name__ == "__main__":
   model_path = r'data\BERT_MODELS\fine-tuned\multi-qa-MiniLM-L6-cos-v1'
   data_path = 'data/raw_data/Jira-2_14_2022.csv'
   query = 'query'

   Search = SemanticSearch(model_path, data_path, query)

   query, flat, top_results = Search.search()

原文

I'm trying to run the python multiprocessing library to speed up encoding of csv file. However I run into this error:

RuntimeError: 
        An attempt has been made to start a new process before the
        current process has finished its bootstrapping phase.

        This probably means that you are not using fork to start your
        child processes and you have forgotten to use the proper idiom
        in the main module:

            if __name__ == '__main__':
                freeze_support()
                ...

        The "freeze_support()" line can be omitted if the program
        is not going to be frozen to produce an executable.

I did create

Search = SemanticSearch(model_path, data_path, query)
if __name__ == '__main__':

    query, flat, top_results = Search.search()

That points to the function in my class,

def setup(self):
        with open(self.data_path, newline='') as f:  # read and sort data
            reader = csv.reader(f)
            data1 = list(reader)
        self.corpus = [x for sublist in data1 for x in sublist]  # turn into 1D list
        #SemanticSearch.encode(self)
        self.texts_encodings = self.map(self.encode, self.corpus)
        end = time.time()
        print(end - self.start)


    def encode(self):
        self.start = time.time()
        return self.model.encode(self.corpus, convert_to_tensor=True, show_progress_bar=True)

In my init function I did call and set

self.map = Pool().map

Any tips something I'm missing? Thanks in advance

EDIT

class SemanticSearch(object):
   def __init__(self, model, data, query):
       self.query = query
       self.model = SentenceTransformer(model)  ### Model location
       self.data_path = data  ###path to csv 
       self.corpus = None
       self.texts_encodings = None
       self.start = None
       self.map = Pool().map

   def setup(self):
       print('here')
       with open(self.data_path, newline='') as f:  # read and sort data
           reader = csv.reader(f)
           data1 = list(reader)
       self.corpus = [x for sublist in data1 for x in sublist]  # turn into 1D list
       # SemanticSearch.encode(self)
       self.texts_encodings = self.map(self.encode, self.corpus)
       # SemanticSearch.encode(self)
       end = time.time()
       print(end - self.start)

   def encode(self):
       self.start = time.time()
       return self.model.encode(self.corpus, convert_to_tensor=True,
                                show_progress_bar=True)  ##encode to invisible layer

   def search(self):
       SemanticSearch.setup(self)


if __name__ == "__main__":
   model_path = r'data\BERT_MODELS\fine-tuned\multi-qa-MiniLM-L6-cos-v1'
   data_path = 'data/raw_data/Jira-2_14_2022.csv'
   query = 'query'

   Search = SemanticSearch(model_path, data_path, query)

   query, flat, top_results = Search.search()

分享到QQ

分享到微博