Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 2 years ago.
# Scrape the local-results ("places") block of a Google search page and print
# the parsed entries as JSON.
#
# NOTE: the CSS class names used below (.VkpGBb, .dbg0pd, .yYlJEf, ...) are
# taken as-is from the page markup and carry no meaning of their own.
import json
import re

# lxml is not referenced by name below, but it backs the 'lxml' parser that is
# passed to BeautifulSoup; importing it here makes a missing install fail early.
import lxml  # noqa: F401
import requests
from bs4 import BeautifulSoup


def _attr_or_none(node, selector, attribute):
    """Return `attribute` of the first `selector` match under `node`, else None.

    None is returned both when nothing matches the selector and when the
    matched element does not carry the attribute.
    """
    match = node.select_one(selector)
    if match is None:
        return None
    return match.get(attribute)


def _text_or_none(node, selector):
    """Return the text of the first `selector` match under `node`, else None."""
    match = node.select_one(selector)
    if match is None:
        return None
    return match.text


def _split_address_phone(address_not_fixed):
    """Split an "address · phone" string into an (address, phone) tuple.

    The phone part is the empty string when the separator is absent.
    """
    # removes phone number from "address_not_fixed" variable
    # https://regex101.com/r/cwLdY8/1
    address = re.sub(r' · ?.*', '', address_not_fixed)
    phone = ''.join(re.findall(r' · ?(.*)', address_not_fixed))
    return address, phone


def _parse_local_result(result):
    """Build the dict for one local-result element.

    Every field whose element is missing from the markup is reported as None
    instead of raising; `phone` stays an empty string when there is no phone.
    """
    title = _text_or_none(result, '.dbg0pd span')
    website = _attr_or_none(result, '.yYlJEf.L48Cpd', 'href')

    # The directions link is stored as a site-relative path in `data-url`.
    directions_path = _attr_or_none(result, '.yYlJEf.VByer', 'data-url')
    if directions_path is None:
        directions = None
    else:
        directions = f"https://www.google.com{directions_path}"

    address_not_fixed = _text_or_none(result, '.lqhpac div')
    if address_not_fixed is None:
        address, phone = None, ''
    else:
        address, phone = _split_address_phone(address_not_fixed)

    # One lookup serves both fields: the options live inside this element and
    # the opening hours are whatever immediately precedes it in the document.
    details = result.select_one('.dXnVAb')
    if details is None:
        hours = None
        options = None
    else:
        # NOTE(review): previous_element is assumed to be a text node (the
        # sample output shows a plain string); a tag here would not be
        # JSON-serialisable -- confirm against live markup.
        hours = details.previous_element
        options = details.text.split('·')

    return {
        'title': title,
        'phone': phone,
        'address': address,
        'hours': hours,
        'options': options,
        'website': website,
        'directions': directions,
    }


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# works with different countries, languages
params = {
    "q": "mcdonalds",
    "gl": "jp",
    "hl": "ja",  # japanese
}

# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.get(
    "https://www.google.com/search",
    headers=headers,
    params=params,
    timeout=30,
)
# Fail loudly on an HTTP error status instead of parsing an error page into [].
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')

local_results = [_parse_local_result(result) for result in soup.select('.VkpGBb')]

print(json.dumps(local_results, indent=2, ensure_ascii=False))
# English results: { "title": "McDonald's", "phone": "(620) 251-3330", "address": "Coffeyville, KS", "hours": " ⋅ Opens 5AM", "options": [ "Curbside pickup", "Delivery" ], "website": "https://www.mcdonalds.com/us/en-us/location/KS/COFFEYVILLE/302-W-11TH/4581.html?cid=RF:YXT:GMB::Clicks", "directions": "https://www.google.com/maps/dir//McDonald's,+302+W+11th+St,+Coffeyville,+KS+67337/data=!4m6!4m5!1m1!4e2!1m2!1m1!1s0x87b784f6803e4c81:0xf5af9c9c89f19918?sa=X&hl=en&gl=us" }
Using only requests and bs4 is hard but possible. I'm not entirely sure what information you are trying to parse, but this should help you:
Here is the output that you will get back; hopefully this helps:
文章 0 评论 0
Using only requests and bs4 is hard but possible. I'm not entirely sure what information you are trying to parse, but this should help you:
Here is the output that you will get back; hopefully this helps: