用 Python 抓取 Rightmove 网页数据（python web scraping rightmove）

发布于 2025-02-10 18:24:43 字数 3038 浏览 2 评论 0原文

我有一个称为“数据”的数据集,看起来像这样:

postcode    location_id url_name
NE30-2BG    1159502     https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=POSTCODE^1159502

我正在使用下面的代码从上面的数据中获取 URL，并从 Rightmove 检索房产详细信息。我希望能在下面的输出旁边一并输出邮政编码（来自 data）。就目前的代码而言，我无法将 RightmoveScraper 检索到的数据与原始邮政编码关联起来。欢迎任何想法！

class RightmoveScraper:
    """Scrape property-card details from Rightmove search-result pages.

    Typical use: RightmoveScraper().run() — fetches every URL in the
    global ``data['url_name']``, parses the listing cards, and writes the
    accumulated rows to rightmove_data.csv.
    """

    def __init__(self):
        # Instance attribute: the original class-level `results = []` was a
        # mutable class attribute shared by every instance, so rows from
        # separate scraper objects would leak into each other.
        self.results = []

    def fetch(self, url):
        """HTTP GET *url* and return the requests Response object."""
        print('HTTP GET request to URL: %s' % url, end='')
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)
        return response

    def parse(self, html):
        """Extract one dict per property card from *html* into self.results."""
        content = BeautifulSoup(html, 'html.parser')

        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        # Bedroom count is the text before the word 'bedroom' in the title.
        bedrooms = [title.split('bedroom')[0].strip() for title in titles]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
        under_over = [qualifier.text.strip() for qualifier in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
        dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]

        # zip() stops at the shortest list, so a card with a missing field
        # can no longer raise IndexError (the original indexed every list
        # by range(len(titles))).
        for title, beds, address, description, price, qualifier, date, seller in zip(
                titles, bedrooms, addresses, descriptions, prices, under_over, dates, sellers):
            self.results.append({
                'title': title,
                'no_of_bedrooms': beds,
                'address': address,
                'description': description,
                'price': price,
                'under_over': qualifier,
                'date': date,
                'seller': seller})

    def to_csv(self):
        """Write self.results to rightmove_data.csv with a header row."""
        if not self.results:
            # Guard: DictWriter below would raise IndexError on results[0].
            print('No results to store')
            return
        # newline='' is required for csv writers (otherwise blank rows
        # appear on Windows); utf-8 keeps non-ASCII addresses intact.
        with open('rightmove_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)
        print('Stored results to "rightmove_data.csv"')

    def run(self):
        """Fetch and parse every URL in the global `data`, then export CSV."""
        # NOTE(review): assumes a global `data` with a 'url_name' column —
        # see the accepted answer for carrying the matching postcode through.
        for url_name in data['url_name']:
            response = self.fetch(url_name)
            self.parse(response.text)

        self.to_csv()
        
        
          
if __name__ == '__main__':
    # Entry point: build a scraper and run the full fetch -> parse -> CSV pipeline.
    RightmoveScraper().run()

I have a dataset called "data" that looks like this:

postcode    location_id url_name
NE30-2BG    1159502     https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=POSTCODE^1159502

I'm using the code below to take the url from the data above and retrieve property details from Rightmove. I want to be able to output the postcode (from data) alongside the output below. As the code stands, I'm unable to link the data retrieved from my RightmoveScraper to the original postcode.
Any ideas appreciated!

class RightmoveScraper:
    """Scrape property-card details from Rightmove search-result pages.

    Typical use: RightmoveScraper().run() — fetches every URL in the
    global ``data['url_name']``, parses the listing cards, and writes the
    accumulated rows to rightmove_data.csv.
    """

    def __init__(self):
        # Instance attribute: the original class-level `results = []` was a
        # mutable class attribute shared by every instance, so rows from
        # separate scraper objects would leak into each other.
        self.results = []

    def fetch(self, url):
        """HTTP GET *url* and return the requests Response object."""
        print('HTTP GET request to URL: %s' % url, end='')
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)
        return response

    def parse(self, html):
        """Extract one dict per property card from *html* into self.results."""
        content = BeautifulSoup(html, 'html.parser')

        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        # Bedroom count is the text before the word 'bedroom' in the title.
        bedrooms = [title.split('bedroom')[0].strip() for title in titles]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
        under_over = [qualifier.text.strip() for qualifier in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
        dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]

        # zip() stops at the shortest list, so a card with a missing field
        # can no longer raise IndexError (the original indexed every list
        # by range(len(titles))).
        for title, beds, address, description, price, qualifier, date, seller in zip(
                titles, bedrooms, addresses, descriptions, prices, under_over, dates, sellers):
            self.results.append({
                'title': title,
                'no_of_bedrooms': beds,
                'address': address,
                'description': description,
                'price': price,
                'under_over': qualifier,
                'date': date,
                'seller': seller})

    def to_csv(self):
        """Write self.results to rightmove_data.csv with a header row."""
        if not self.results:
            # Guard: DictWriter below would raise IndexError on results[0].
            print('No results to store')
            return
        # newline='' is required for csv writers (otherwise blank rows
        # appear on Windows); utf-8 keeps non-ASCII addresses intact.
        with open('rightmove_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)
        print('Stored results to "rightmove_data.csv"')

    def run(self):
        """Fetch and parse every URL in the global `data`, then export CSV."""
        # NOTE(review): assumes a global `data` with a 'url_name' column —
        # see the accepted answer for carrying the matching postcode through.
        for url_name in data['url_name']:
            response = self.fetch(url_name)
            self.parse(response.text)

        self.to_csv()
        
        
          
if __name__ == '__main__':
    # Entry point: build a scraper and run the full fetch -> parse -> CSV pipeline.
    RightmoveScraper().run()

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(2)

┼── 2025-02-17 18:24:43

看来您使用的是 DataFrame，因此可以像这样使用 .iterrows()：

import pandas as pd

# Toy stand-in for the asker's DataFrame of postcodes and search URLs.
data = {
    'postcode': ['A', 'B', 'C'],
    'url_name': ['www1', 'www2', 'www3'],
    'other':    ['X', 'Y', 'Z'],
}

df = pd.DataFrame(data)


def run():
    """Print index, postcode, url_name and other for every row of df."""
    for index, row in df.iterrows():
        # Fixed: the original `print('index   :', index` was missing the
        # closing parenthesis, which is a SyntaxError.
        print('index   :', index)
        print('postcode:', row['postcode'])
        print('url_name:', row['url_name'])
        print('other   :', row['other'])
        print('----')
        # In the real scraper, fetch/parse with the postcode attached:
        # response = self.fetch(row['url_name'])
        # self.parse(response.text, row['postcode'])


run()

结果:

index   : 0
postcode: A
url_name: www1
other   : X
----
index   : 1
postcode: B
url_name: www2
other   : Y
----
index   : 2
postcode: C
url_name: www3
other   : Z

或者可以使用 .apply() 对所有行执行函数。

import pandas as pd

# Toy stand-in for the asker's DataFrame of postcodes and search URLs.
data = {
    'postcode': ['A', 'B', 'C'],
    'url_name': ['www1', 'www2', 'www3'],
    'other':    ['X', 'Y', 'Z'],
}

df = pd.DataFrame(data)


def process(row):
    """Print the fields of one DataFrame row."""
    print('postcode:', row['postcode'])
    print('url_name:', row['url_name'])
    print('other   :', row['other'])
    print('----')
    # In the real scraper, fetch/parse with the postcode attached:
    # response = self.fetch(row['url_name'])
    # self.parse(response.text, row['postcode'])


def run():
    """Run process() on every row of df."""
    df.apply(process, axis=1)


run()

It seems you use DataFrame so you could use .iterrows() like this

import pandas as pd

# Toy stand-in for the asker's DataFrame of postcodes and search URLs.
data = {
    'postcode': ['A', 'B', 'C'],
    'url_name': ['www1', 'www2', 'www3'],
    'other':    ['X', 'Y', 'Z'],
}

df = pd.DataFrame(data)


def run():
    """Print index, postcode, url_name and other for every row of df."""
    for index, row in df.iterrows():
        # Fixed: the original `print('index   :', index` was missing the
        # closing parenthesis, which is a SyntaxError.
        print('index   :', index)
        print('postcode:', row['postcode'])
        print('url_name:', row['url_name'])
        print('other   :', row['other'])
        print('----')
        # In the real scraper, fetch/parse with the postcode attached:
        # response = self.fetch(row['url_name'])
        # self.parse(response.text, row['postcode'])


run()

Result:

index   : 0
postcode: A
url_name: www1
other   : X
----
index   : 1
postcode: B
url_name: www2
other   : Y
----
index   : 2
postcode: C
url_name: www3
other   : Z

Or you could use .apply() to execute function on all rows.

import pandas as pd

# Toy stand-in for the asker's DataFrame of postcodes and search URLs.
data = {
    'postcode': ['A', 'B', 'C'],
    'url_name': ['www1', 'www2', 'www3'],
    'other':    ['X', 'Y', 'Z'],
}

df = pd.DataFrame(data)


def process(row):
    """Print the fields of one DataFrame row."""
    print('postcode:', row['postcode'])
    print('url_name:', row['url_name'])
    print('other   :', row['other'])
    print('----')
    # In the real scraper, fetch/parse with the postcode attached:
    # response = self.fetch(row['url_name'])
    # self.parse(response.text, row['postcode'])


def run():
    """Run process() on every row of df."""
    df.apply(process, axis=1)


run()

紫﹏色ふ单纯 2025-02-17 18:24:43

感谢 furas，这个方案完美解决了问题！谢谢你！

 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 import csv

# Load the postcode -> search-URL table. pd.read_csv already returns a
# DataFrame, so the original extra pd.DataFrame(data) wrapper was
# redundant; both names stay bound for backward compatibility.
data = pd.read_csv('postcode data.csv')
df = data

class RightmoveScraper:
    """Scrape Rightmove property cards, tagging each row with its postcode.

    Typical use: RightmoveScraper().run() — iterates the global ``df``
    (postcode, url_name columns), parses each result page, and writes the
    accumulated rows to output.csv.
    """

    def __init__(self):
        # Instance attribute: the original class-level `results = []` was a
        # mutable class attribute shared by every instance, so rows from
        # separate scraper objects would leak into each other.
        self.results = []

    def fetch(self, url):
        """HTTP GET *url* and return the requests Response object."""
        print('HTTP GET request to URL: %s' % url, end='')
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)
        return response

    def parse(self, html, pp):
        """Parse *html* and append one dict per card, tagged with postcode *pp*."""
        content = BeautifulSoup(html, 'html.parser')

        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        # Bedroom count is the text before the word 'bedroom' in the title.
        bedrooms = [title.split('bedroom')[0].strip() for title in titles]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
        under_over = [qualifier.text.strip() for qualifier in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
        dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]

        # zip() stops at the shortest list, so a card with a missing field
        # can no longer raise IndexError (the original indexed every list
        # by range(len(titles))).
        for title, beds, address, description, price, qualifier, date, seller in zip(
                titles, bedrooms, addresses, descriptions, prices, under_over, dates, sellers):
            self.results.append({
                'postcode': pp,
                'title': title,
                'no_of_bedrooms': beds,
                'address': address,
                'description': description,
                'price': price,
                'under_over': qualifier,
                'date': date,
                'seller': seller})

    def to_csv(self):
        """Write self.results to output.csv with a header row."""
        if not self.results:
            # Guard: DictWriter below would raise IndexError on results[0].
            print('No results to store')
            return
        # newline='' is required for csv writers (otherwise blank rows
        # appear on Windows); utf-8 keeps non-ASCII addresses intact.
        with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)
        print('Stored results to "output.csv"')

    def run(self):
        """Fetch and parse every (postcode, url) row of the global df, then export."""
        for index, row in df.iterrows():
            pp = row['postcode']
            response = self.fetch(row['url_name'])
            # Carrying pp into parse() links each scraped card back to its
            # originating postcode — the point of the original question.
            self.parse(response.text, pp)

        self.to_csv()

        
        
          
if __name__ == '__main__':
    # Entry point: build a scraper and run the full fetch -> parse -> CSV pipeline.
    RightmoveScraper().run()

With thanks to furas, this works a treat! Thank you!

 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 import csv

# Load the postcode -> search-URL table. pd.read_csv already returns a
# DataFrame, so the original extra pd.DataFrame(data) wrapper was
# redundant; both names stay bound for backward compatibility.
data = pd.read_csv('postcode data.csv')
df = data

class RightmoveScraper:
    """Scrape Rightmove property cards, tagging each row with its postcode.

    Typical use: RightmoveScraper().run() — iterates the global ``df``
    (postcode, url_name columns), parses each result page, and writes the
    accumulated rows to output.csv.
    """

    def __init__(self):
        # Instance attribute: the original class-level `results = []` was a
        # mutable class attribute shared by every instance, so rows from
        # separate scraper objects would leak into each other.
        self.results = []

    def fetch(self, url):
        """HTTP GET *url* and return the requests Response object."""
        print('HTTP GET request to URL: %s' % url, end='')
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)
        return response

    def parse(self, html, pp):
        """Parse *html* and append one dict per card, tagged with postcode *pp*."""
        content = BeautifulSoup(html, 'html.parser')

        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        # Bedroom count is the text before the word 'bedroom' in the title.
        bedrooms = [title.split('bedroom')[0].strip() for title in titles]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
        under_over = [qualifier.text.strip() for qualifier in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
        dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]

        # zip() stops at the shortest list, so a card with a missing field
        # can no longer raise IndexError (the original indexed every list
        # by range(len(titles))).
        for title, beds, address, description, price, qualifier, date, seller in zip(
                titles, bedrooms, addresses, descriptions, prices, under_over, dates, sellers):
            self.results.append({
                'postcode': pp,
                'title': title,
                'no_of_bedrooms': beds,
                'address': address,
                'description': description,
                'price': price,
                'under_over': qualifier,
                'date': date,
                'seller': seller})

    def to_csv(self):
        """Write self.results to output.csv with a header row."""
        if not self.results:
            # Guard: DictWriter below would raise IndexError on results[0].
            print('No results to store')
            return
        # newline='' is required for csv writers (otherwise blank rows
        # appear on Windows); utf-8 keeps non-ASCII addresses intact.
        with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)
        print('Stored results to "output.csv"')

    def run(self):
        """Fetch and parse every (postcode, url) row of the global df, then export."""
        for index, row in df.iterrows():
            pp = row['postcode']
            response = self.fetch(row['url_name'])
            # Carrying pp into parse() links each scraped card back to its
            # originating postcode — the point of the original question.
            self.parse(response.text, pp)

        self.to_csv()

        
        
          
if __name__ == '__main__':
    # Entry point: build a scraper and run the full fetch -> parse -> CSV pipeline.
    RightmoveScraper().run()
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文