无法使用 Python urllib2 加载 ASP.NET 页面
我正在尝试向 https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx 发送 POST 请求以抓取数据。
这是我当前的代码:
# Python 2 script: POST the "Well Details" search form of an ASP.NET WebForms
# page with hand-copied hidden fields, then print the HTML that comes back.
from urllib import urlencode
import urllib2

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'

# Real HTTP header names are used here. 'HTTP_USER_AGENT' / 'HTTP_ACCEPT' are
# the CGI environment-variable spellings and would be sent as unknown headers,
# leaving urllib2's default User-Agent in place.
# 'Accept-Encoding' is deliberately not sent: urllib2 does not decompress
# gzip/deflate/sdch bodies, so advertising them can yield unreadable output.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.13) Gecko/2009073022 Firefox/3.0.13',
    'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Language': 'en-US,en;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
}

# Form fields are given in their raw (un-encoded) form; urlencode() below does
# the percent-encoding. A key that already contains '%24' would be encoded a
# second time ('%2524') and no longer match the control name.
formFields = [
    (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber', '003-00013'),
    (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB', '003-00013'),
    (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS', '0'),
    (r'ctl00$MainContent$WellDetailsCriteria1$ViewDataButton', 'View Data'),
    # NOTE(review): this pasted value contains an elided section (' ... ') and
    # embedded spaces, so it is not a complete token; presumably the server
    # rejects it. Read a fresh value from the page before posting.
    (r'__VIEWSTATE', r'/wEPDwUJOTc2MzI0NTk4D2QWAmYPDxYEHglQYWdlVGl0bGUFDFdlbGwgRGV0YWlscx4SUmVxdWlyZXNKYXZhU2NyaXB0Z2QWAgIDD2QWCGYPFgIeBFRleHQF1hA8ZGl2IHN0eWxlPSJoZWlnaHQ6IDE0OXB4OyB3aWR0aDogOTUycHg7IGJhY2tncm91bmQtcmVwZWF0OiBuby1yZXBlYXQ7IGJhY2tncm91bmQtaW1hZ2U6dXJsKGh0dHBzOi8vd3d3LmFoczIuZGVwLnN0YXRlLnBhLnVzL2ltYWdlcy9kZXBfZXh0ZXJuYWxfb ... YWRlciRIZWFkZXJWaWV3D2dkrp784OTosLLEOFxy/mWBtsit I6kjKRlZ/ 1IBCkZNk='),
    (r'__EVENTVALIDATION', r'/wEWBALn79faCwK+qZJIAqXY04cBAorCkdMKL5VEAnd1IIQ3cnIHRxZAluFo5G5Y5ffyRXRdtmBiGCc='),
    # NOTE(review): both values below repeat the control name twice; for a
    # plain button submit they are presumably empty -- confirm against what a
    # browser actually posts.
    (r'__EVENTTARGET', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber'),
    (r'__EVENTARGUMENT', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber'),
]

# Load page: passing a data argument makes urllib2 issue a POST.
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
r = urllib2.urlopen(req)

# Handle results
print(r.read())
返回的页面显示“抱歉,我们遇到技术困难。请稍后重试您的请求”,所以我知道我一定是哪里弄错了。我没有发送 cookie,但我不确定这是否有必要。如果有必要,我可以直接将“Cookie:ASP.NET_SessionId=whatever”添加到请求头中,还是需要使用 CookieLib?
任何关于出了什么问题的想法将不胜感激!
编辑: 这是直接从页面提取 __VIEWSTATE 和 __EVENTVALIDATION 信息的代码的更新版本(因此我不需要复制和粘贴它或担心它已过期)
# Python 2 script: fetch the "Well Details" ASP.NET WebForms page, read its
# hidden __VIEWSTATE / __EVENTVALIDATION tokens, then POST the search form
# with those tokens through a cookie-aware opener and print the result.
from urllib import urlencode
import urllib2
from BeautifulSoup import BeautifulSoup
import cookielib

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'

# Create headers.
# 'Accept-Encoding' is deliberately not sent: urllib2 does not decompress
# gzip/deflate/sdch bodies, and a compressed body would make the
# BeautifulSoup lookups below come back empty.
headers = {
    'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'www.paoilandgasreporting.state.pa.us',
    'Origin': 'https://www.paoilandgasreporting.state.pa.us',
    'Referer': 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16',
}

# Set up cookie jar. HTTPCookieProcessor stores cookies from every response
# and attaches them to every request made through `opener`, so no manual
# cj.add_cookie_header() calls are needed.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPSHandler(debuglevel=1))

# Grab information that we need to pass along with our requests.
# data=None makes this a GET; an empty string would be sent as an empty POST.
req = urllib2.Request(uri, None, headers)
r = opener.open(req)
print(cj)
soup = BeautifulSoup(r.read())

# Fail with a clear message if the page did not contain the hidden inputs
# (for example because an error page was returned).
eventvalidation_input = soup.find('input', id='__EVENTVALIDATION')
viewstate_input = soup.find('input', id='__VIEWSTATE')
if eventvalidation_input is None or viewstate_input is None:
    raise ValueError('__EVENTVALIDATION / __VIEWSTATE input not found in the fetched page')
eventvalidation = eventvalidation_input['value']
viewstate = viewstate_input['value']

formFields = [
    ('__EVENTVALIDATION', eventvalidation),
    ('__VIEWSTATE', viewstate),
    ('__EVENTTARGET', ''),
    ('__EVENTARGUMENT', ''),
    ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber', '003-00013'),
    ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB', '003-00013'),
    ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS', '0'),  # TODO what value to pass?
    ('ctl00$MainContent$WellDetailsCriteria1$ViewDataButton', 'View Data'),  # do we need this?
]

# Load page: passing a data argument makes urllib2 issue a POST; the session
# cookie captured above is added by the opener.
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
r = opener.open(req)

# Handle results
print(r.read())
I am trying to do a POST request to https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx in order to scrape data.
Here is my current code:
# Python 2 script: POST the "Well Details" search form of an ASP.NET WebForms
# page with hand-copied hidden fields, then print the HTML that comes back.
from urllib import urlencode
import urllib2

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'

# Real HTTP header names are used here. 'HTTP_USER_AGENT' / 'HTTP_ACCEPT' are
# the CGI environment-variable spellings and would be sent as unknown headers,
# leaving urllib2's default User-Agent in place.
# 'Accept-Encoding' is deliberately not sent: urllib2 does not decompress
# gzip/deflate/sdch bodies, so advertising them can yield unreadable output.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.13) Gecko/2009073022 Firefox/3.0.13',
    'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Language': 'en-US,en;q=0.8',
    'Content-Type': 'application/x-www-form-urlencoded',
}

# Form fields are given in their raw (un-encoded) form; urlencode() below does
# the percent-encoding. A key that already contains '%24' would be encoded a
# second time ('%2524') and no longer match the control name.
formFields = [
    (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber', '003-00013'),
    (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB', '003-00013'),
    (r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS', '0'),
    (r'ctl00$MainContent$WellDetailsCriteria1$ViewDataButton', 'View Data'),
    # NOTE(review): this pasted value contains an elided section (' ... ') and
    # embedded spaces, so it is not a complete token; presumably the server
    # rejects it. Read a fresh value from the page before posting.
    (r'__VIEWSTATE', r'/wEPDwUJOTc2MzI0NTk4D2QWAmYPDxYEHglQYWdlVGl0bGUFDFdlbGwgRGV0YWlscx4SUmVxdWlyZXNKYXZhU2NyaXB0Z2QWAgIDD2QWCGYPFgIeBFRleHQF1hA8ZGl2IHN0eWxlPSJoZWlnaHQ6IDE0OXB4OyB3aWR0aDogOTUycHg7IGJhY2tncm91bmQtcmVwZWF0OiBuby1yZXBlYXQ7IGJhY2tncm91bmQtaW1hZ2U6dXJsKGh0dHBzOi8vd3d3LmFoczIuZGVwLnN0YXRlLnBhLnVzL2ltYWdlcy9kZXBfZXh0ZXJuYWxfb ... YWRlciRIZWFkZXJWaWV3D2dkrp784OTosLLEOFxy/mWBtsit I6kjKRlZ/ 1IBCkZNk='),
    (r'__EVENTVALIDATION', r'/wEWBALn79faCwK+qZJIAqXY04cBAorCkdMKL5VEAnd1IIQ3cnIHRxZAluFo5G5Y5ffyRXRdtmBiGCc='),
    # NOTE(review): both values below repeat the control name twice; for a
    # plain button submit they are presumably empty -- confirm against what a
    # browser actually posts.
    (r'__EVENTTARGET', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber'),
    (r'__EVENTARGUMENT', r'ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber'),
]

# Load page: passing a data argument makes urllib2 issue a POST.
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
r = urllib2.urlopen(req)

# Handle results
print(r.read())
The page that is returned says "Sorry, we are having technical difficulties. Please try your request again later" so I know I must be messing something up. I am not sending a cookie, but I wasn't sure if this was necessary. If it is, can I just add "Cookie:ASP.NET_SessionId=whatever" to my headers or do I need to use CookieLib?
Any thoughts on what is going wrong would be most appreciated!
EDIT:
Here is an updated version of the code that pulls the __VIEWSTATE and __EVENTVALIDATION information from the page directly (so I don't need to copy and paste it or worry about it having expired)
# Python 2 script: fetch the "Well Details" ASP.NET WebForms page, read its
# hidden __VIEWSTATE / __EVENTVALIDATION tokens, then POST the search form
# with those tokens through a cookie-aware opener and print the result.
from urllib import urlencode
import urllib2
from BeautifulSoup import BeautifulSoup
import cookielib

# Configuration
uri = 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx'

# Create headers.
# 'Accept-Encoding' is deliberately not sent: urllib2 does not decompress
# gzip/deflate/sdch bodies, and a compressed body would make the
# BeautifulSoup lookups below come back empty.
headers = {
    'Accept': 'application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'www.paoilandgasreporting.state.pa.us',
    'Origin': 'https://www.paoilandgasreporting.state.pa.us',
    'Referer': 'https://www.paoilandgasreporting.state.pa.us/publicreports/Modules/WellDetails/WellDetails.aspx',
    'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16',
}

# Set up cookie jar. HTTPCookieProcessor stores cookies from every response
# and attaches them to every request made through `opener`, so no manual
# cj.add_cookie_header() calls are needed.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj), urllib2.HTTPSHandler(debuglevel=1))

# Grab information that we need to pass along with our requests.
# data=None makes this a GET; an empty string would be sent as an empty POST.
req = urllib2.Request(uri, None, headers)
r = opener.open(req)
print(cj)
soup = BeautifulSoup(r.read())

# Fail with a clear message if the page did not contain the hidden inputs
# (for example because an error page was returned).
eventvalidation_input = soup.find('input', id='__EVENTVALIDATION')
viewstate_input = soup.find('input', id='__VIEWSTATE')
if eventvalidation_input is None or viewstate_input is None:
    raise ValueError('__EVENTVALIDATION / __VIEWSTATE input not found in the fetched page')
eventvalidation = eventvalidation_input['value']
viewstate = viewstate_input['value']

formFields = [
    ('__EVENTVALIDATION', eventvalidation),
    ('__VIEWSTATE', viewstate),
    ('__EVENTTARGET', ''),
    ('__EVENTARGUMENT', ''),
    ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber', '003-00013'),
    ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberTB', '003-00013'),
    ('ctl00$MainContent$WellDetailsCriteria1$SearchPermitNumber$ob_CboSearchPermitNumberSIS', '0'),  # TODO what value to pass?
    ('ctl00$MainContent$WellDetailsCriteria1$ViewDataButton', 'View Data'),  # do we need this?
]

# Load page: passing a data argument makes urllib2 issue a POST; the session
# cookie captured above is added by the opener.
encodedFields = urlencode(formFields)
req = urllib2.Request(uri, encodedFields, headers)
r = opener.open(req)

# Handle results
print(r.read())
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
我也遇到过同样的问题。我找到的唯一解决方案(我猜这不是最好的解决方案)是在“while first_visit is True:”时解析数据,之后再用这些值提交表单。
之后:
我在Python方面是个新手,但这对我有用......所以如果它可以帮助..!
I had the same problem. The only solution I've found (which is probably not the best one) is to parse the data "while first_visit is True:" and afterwards submit your form with these values.
And after :
I am quite a n00b in Python but that worked for me ... so if it can help .. !
您提交的数据一定不是他们所期望的,因此才会产生该错误。
您可以准确地找出浏览器提交的内容,然后在脚本中复现它。有各种 Firefox 扩展可以帮助您执行此操作,例如 TamperData、Firebug 和 LiveHttp。
然而,最简单的选择可能是使用 mechanize。
You must not be submitting the data they are expecting, which is generating that error.
You could find out exactly what your browser submits and then replicate this in your script. There are various Firefox extensions that will help you do this, such as TamperData, Firebug, and LiveHttp.
However, your easiest option would probably be to use mechanize.