python web刮擦rightmove
我有一个称为“数据”的数据集,看起来像这样:
postcode location_id url_name
NE30-2BG 1159502 https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=POSTCODE^1159502
我正在使用下面的代码从上面的数据中获取URL并从RightMove检索属性详细信息。我希望能够在下面的输出旁边输出邮政编码(从数据)输出。当代码所处的情况下,我无法将从RightMovesCraper检索到的数据链接到原始邮政编码。 任何想法都赞赏!
class RightmoveScraper:
results = []
def fetch(self, url):
print('HTTP GET request to URL: %s' % url, end ='')
response = requests.get(url)
print(' | Status code: %s' % response.status_code)
return response
def parse(self, html):
content = BeautifulSoup(html, 'html.parser') #lxml
titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
bedrooms = [title.text.split('bedroom')[0].strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
under_over = [underover.text.strip() for underover in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
#code1 = [price.text.strip() for price in content.findAll('type', {'hidden': 'value'})]
dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span',{'class': 'propertyCard-branchSummary-branchName'})]
for index in range(0, len(titles)):
self.results.append({
'title': titles[index],
'no_of_bedrooms' : bedrooms[index],
'address': addresses[index],
'description': descriptions[index],
'price': prices[index],
'under_over': under_over[index],
#'code1': code1[index],
'date': dates[index],
'seller': sellers[index]})
def to_csv(self):
with open('rightmove_data.csv','w') as csv_file:
writer = csv.DictWriter(csv_file,fieldnames=self.results[0].keys())
writer.writeheader()
for row in self.results:
writer.writerow(row)
print('Stored results to "rightmove_data.csv"')
def run(self):
for url_name in data['url_name']:
#postcode = data['postcode']
url = url_name
response = self.fetch(url)
self.parse(response.text)
self.to_csv()
if __name__ == '__main__':
scraper = RightmoveScraper()
scraper.run()
I have a dataset called "data" that looks like this:
postcode location_id url_name
NE30-2BG 1159502 https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=POSTCODE^1159502
I'm using the code below to take the url from the data above and retrieve property details from Rightmove. I want to be able to output the postcode (from data) alongside the output below. As the code stands, I'm unable to link the data retrieved from my RightmoveScraper to the original postcode.
Any ideas appreciated!
class RightmoveScraper:
results = []
def fetch(self, url):
print('HTTP GET request to URL: %s' % url, end ='')
response = requests.get(url)
print(' | Status code: %s' % response.status_code)
return response
def parse(self, html):
content = BeautifulSoup(html, 'html.parser') #lxml
titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
bedrooms = [title.text.split('bedroom')[0].strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
under_over = [underover.text.strip() for underover in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
#code1 = [price.text.strip() for price in content.findAll('type', {'hidden': 'value'})]
dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span',{'class': 'propertyCard-branchSummary-branchName'})]
for index in range(0, len(titles)):
self.results.append({
'title': titles[index],
'no_of_bedrooms' : bedrooms[index],
'address': addresses[index],
'description': descriptions[index],
'price': prices[index],
'under_over': under_over[index],
#'code1': code1[index],
'date': dates[index],
'seller': sellers[index]})
def to_csv(self):
with open('rightmove_data.csv','w') as csv_file:
writer = csv.DictWriter(csv_file,fieldnames=self.results[0].keys())
writer.writeheader()
for row in self.results:
writer.writerow(row)
print('Stored results to "rightmove_data.csv"')
def run(self):
for url_name in data['url_name']:
#postcode = data['postcode']
url = url_name
response = self.fetch(url)
self.parse(response.text)
self.to_csv()
if __name__ == '__main__':
scraper = RightmoveScraper()
scraper.run()
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
看来您使用
dataFrame
,因此可以使用.iterrows()
像这样的结果:
或者可以使用
.apply()
在所有行。It seems you use
DataFrame
so you could use.iterrows()
like thisResult:
Or you could use
.apply()
to execute function on all rows.感谢 furas ,这可以很好地享用!谢谢你!
With thanks to furas, this works a treat! Thank you!