屏幕抓取 ASP.NET 页面无法正常工作

发布于 11-28 00:03 字数 3378 浏览 1 评论 0原文

我正在尝试在以下站点的页面上恢复日历事件: http://www.wphospital.org/News-Events/Calendar-of-Events.aspx

请注意,该网站有一个名为“Month”的链接 - 我需要能够发布请求特定日历事件的数据 月。我无法让它发挥作用。这是代码:

private static void GetData(ref string buf)
{
    try
    {
        //First, request the search form to get the viewstate value 
        HttpWebRequest webRequest = default(HttpWebRequest);
        webRequest = (HttpWebRequest)System.Net.WebRequest.Create("http://www.wphospital.org/News-Events/Calendar-of-Events.aspx");
        StreamReader responseReader = new StreamReader(webRequest.GetResponse().GetResponseStream());
        string responseData = responseReader.ReadToEnd();
        responseReader.Close();

        //Extract the viewstate value and build out POST data 
        string viewState = ExtractViewState(responseData);
        string eventValidation = ExtractEventValidation(responseData);
        string postData = null;

        postData = String.Format("ctl00$manScript={0}&__EVENTTARGET=&__EVENTARGUMENT&__LASTFOCUS=&__VIEWSTATE={1}&lng={2}&__EVENTVALIDATION={3}&ctl00$searchbox1$txtWord={4}&textfield2={5}&ctl00$plcMain$lstbxCategory={6}&ctl00$plcMain$lstbxSubCategory={7}", "ctl00$plcMain$updMonthNav|ctl00$plcMain$btnNextMonth", viewState, "en-US", eventValidation, "Search", "your search here", 0, 0);

        var encoding = new ASCIIEncoding();
        byte[] data = encoding.GetBytes(postData);

        //Now post to the search form 
        webRequest = (HttpWebRequest)System.Net.WebRequest.Create("http://www.wphospital.org/News-Events/Calendar-of-Events.aspx");
        webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)";
        webRequest.Method = "POST";
        webRequest.ContentType = "application/x-www-form-urlencoded";
        webRequest.ContentLength = data.Length;

        var newStream = webRequest.GetRequestStream();
        newStream.Write(data, 0, data.Length);
        newStream.Close();

        responseReader = new StreamReader(webRequest.GetResponse().GetResponseStream());

        //And read the response 
        responseData = responseReader.ReadToEnd();
        responseReader.Close();
        buf = responseData;
    }
    catch (WebException ex)
    {
        if (ex.Status == WebExceptionStatus.ProtocolError)
        {
            Console.Write("The server returned protocol error ");
            // Get HttpWebResponse so that you can check the HTTP status code.
            HttpWebResponse httpResponse = (HttpWebResponse)ex.Response;
            int sc = (int)httpResponse.StatusCode;
            string strsc = httpResponse.StatusCode.ToString();
        }
    }
}

private static string ExtractViewState(string s)
{
    string viewStateNameDelimiter = "__VIEWSTATE";
    string valueDelimiter = "value=\"";

    int viewStateNamePosition = s.IndexOf(viewStateNameDelimiter);
    int viewStateValuePosition = s.IndexOf(valueDelimiter, viewStateNamePosition);

    int viewStateStartPosition = viewStateValuePosition + valueDelimiter.Length;
    int viewStateEndPosition = s.IndexOf("\"", viewStateStartPosition);

    return HttpUtility.UrlEncodeUnicode(s.Substring(viewStateStartPosition, viewStateEndPosition - viewStateStartPosition));
}

任何人都可以指出我正确的方向吗?

I am trying to bring back the calendar events on the page at the following site: http://www.wphospital.org/News-Events/Calendar-of-Events.aspx

Notice that this site has a link called "Month" - I need to be able POST data requesting calendar events for a particular month. I cannot get this to work. Here is the code:

private static void GetData(ref string buf)
{
    try
    {
        //First, request the search form to get the viewstate value 
        HttpWebRequest webRequest = default(HttpWebRequest);
        webRequest = (HttpWebRequest)System.Net.WebRequest.Create("http://www.wphospital.org/News-Events/Calendar-of-Events.aspx");
        StreamReader responseReader = new StreamReader(webRequest.GetResponse().GetResponseStream());
        string responseData = responseReader.ReadToEnd();
        responseReader.Close();

        //Extract the viewstate value and build out POST data 
        string viewState = ExtractViewState(responseData);
        string eventValidation = ExtractEventValidation(responseData);
        string postData = null;

        postData = String.Format("ctl00$manScript={0}&__EVENTTARGET=&__EVENTARGUMENT&__LASTFOCUS=&__VIEWSTATE={1}&lng={2}&__EVENTVALIDATION={3}&ctl00$searchbox1$txtWord={4}&textfield2={5}&ctl00$plcMain$lstbxCategory={6}&ctl00$plcMain$lstbxSubCategory={7}", "ctl00$plcMain$updMonthNav|ctl00$plcMain$btnNextMonth", viewState, "en-US", eventValidation, "Search", "your search here", 0, 0);

        var encoding = new ASCIIEncoding();
        byte[] data = encoding.GetBytes(postData);

        //Now post to the search form 
        webRequest = (HttpWebRequest)System.Net.WebRequest.Create("http://www.wphospital.org/News-Events/Calendar-of-Events.aspx");
        webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)";
        webRequest.Method = "POST";
        webRequest.ContentType = "application/x-www-form-urlencoded";
        webRequest.ContentLength = data.Length;

        var newStream = webRequest.GetRequestStream();
        newStream.Write(data, 0, data.Length);
        newStream.Close();

        responseReader = new StreamReader(webRequest.GetResponse().GetResponseStream());

        //And read the response 
        responseData = responseReader.ReadToEnd();
        responseReader.Close();
        buf = responseData;
    }
    catch (WebException ex)
    {
        if (ex.Status == WebExceptionStatus.ProtocolError)
        {
            Console.Write("The server returned protocol error ");
            // Get HttpWebResponse so that you can check the HTTP status code.
            HttpWebResponse httpResponse = (HttpWebResponse)ex.Response;
            int sc = (int)httpResponse.StatusCode;
            string strsc = httpResponse.StatusCode.ToString();
        }
    }
}

private static string ExtractViewState(string s)
{
    string viewStateNameDelimiter = "__VIEWSTATE";
    string valueDelimiter = "value=\"";

    int viewStateNamePosition = s.IndexOf(viewStateNameDelimiter);
    int viewStateValuePosition = s.IndexOf(valueDelimiter, viewStateNamePosition);

    int viewStateStartPosition = viewStateValuePosition + valueDelimiter.Length;
    int viewStateEndPosition = s.IndexOf("\"", viewStateStartPosition);

    return HttpUtility.UrlEncodeUnicode(s.Substring(viewStateStartPosition, viewStateEndPosition - viewStateStartPosition));
}

Can anyone point me in the right direction?

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(3

朕就是辣么酷2024-12-05 00:03:51

这可能会也可能不会解决您的问题,因为当您说它不起作用时,我不知道问题到底是什么。但正如“Al W”指出的那样 - 来自异步回发的响应不会看起来像直接的 HTML 流。因此,如果您的问题是事后解析它,那么这可能会有所帮助。

我最近有“机会”发现这一点,因为我需要重写该输出。我正在开发 C# jQuery 端口,发现当我尝试重新启动时,我正在破坏 WebForms 页面在异步回发期间呈现输出流。我浏览了解析响应的客户端脚本并找出了响应的格式。

每个更新的面板都会返回一个数据块,其格式如下:

“长度|类型|ID|内容”

可以有任意数量的这些数据串在一起。 UpdatePanels 的类型为“updatePanel”。 ID 是控件的 UniqueID,Content 是实际的 HTML 数据。长度等于 Content 中的字节数,您需要使用它来解析每个块,因为分隔符可能出现在 Content 本身内部。因此,如果您决定在将其发送回 ASP.NET 页面之前重写此数据(就像我所做的那样),您需要更新 Length 以反映内容的最终长度。

我用来解析和重写它的代码位于 Server/CsQueryHttpContext

This may or may not solve your problem because I don't know exactly what the problem is when you say it's not working. But as "Al W" noted - the response from an async postback is not going to look like a straight HTML stream. So if your problem is parsing it afterwards then this might help.

I had the "opportunity" to discover this recently because I needed to rewrite that output. I'm working on a C# jQuery port and found that I was breaking WebForms pages when I tried to re-render the output stream during an async postback. I went through the client script that parses the response and figured out the format of the response.

Each panel that is updated will return a block of data that is formatted like:

"Length|Type|ID|Content"

There could be any number of these strung together. Type is "updatePanel" for UpdatePanels. ID is the UniqueID of the control, and Content is the actual HTML data. Length is equal to the number of bytes in Content, and you need to use that to parse each block, because the separator character may appear inside Content itself. So if you decided you wanted to rewrite this data before sending it back to an ASP.NET page (like I did) you need to update Length to reflect the final length of your content.

The code I used to parse and rewrite it is in Server/CsQueryHttpContext.

夜访吸血鬼2024-12-05 00:03:51

对于 POST 操作,您希望它采用 UTF-8 编码,因此只需重新执行一行

        //var encoding = new ASCIIEncoding();
        //byte[] data = encoding.GetBytes(postData);
        //do this instead.....
        byte[] data = Encoding.UTF8.GetBytes(postData);

,看看这是否对您有帮助

For POST operations, you want it to be UTF-8 encoded, so just re-do the one line

        //var encoding = new ASCIIEncoding();
        //byte[] data = encoding.GetBytes(postData);
        //do this instead.....
        byte[] data = Encoding.UTF8.GetBytes(postData);

and see if this helps you out

你爱我像她2024-12-05 00:03:51

下面是我在 Chrome 中单击“每月”按钮时得到的网络跟踪。请注意 __EVENTTARGET:ctl00$plcMain$monthBtn asp.net 中有一个 javascript 框架,当单击该链接时,该框架会调用 javascript:postback() 方法,从而设置事件目标。这基本上就是 ASP.NET Webform 知道在回发时触发哪个事件的方式。这里一件棘手的事情是网页正在使用更新面板,因此您可能无法获得真正的 html 响应。如果您的请求看起来像这样,那么您应该会得到成功的响应。希望这有帮助。

Request URL:http://www.wphospital.org/News-Events/Calendar-of-Events.aspx
Request Method:POST
Status Code:200 OK
Request Headers
Accept-Charset:ISO-8859-1,utf-8;q=0.7,*;q=0.3
Accept-Encoding:gzip,deflate,sdch
Accept-Language:en-US,en;q=0.8
Cache-Control:no-cache
Content-Length:9718
Content-Type:application/x-www-form-urlencoded
Cookie:CMSPreferredCulture=en-US; ASP.NET_SessionId=h2nval45vq0q5yb0cp233huc; __utma=101137351.234148951.1312486481.1312486481.1312486481.1; __utmb=101137351.1.10.1312486481; __utmc=101137351; __utmz=101137351.1312486481.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __unam=ef169fe-131964a5f2a-24ec879b-1
Host:www.wphospital.org
Origin:http://www.wphospital.org
Proxy-Connection:keep-alive
Referer:http://www.wphospital.org/News-Events/Calendar-of-Events.aspx
User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.124 Safari/534.30
X-MicrosoftAjax:Delta=true
Form Dataview URL encoded
ctl00$manScript:ctl00$plcMain$updTab|ctl00$plcMain$monthBtn
__EVENTTARGET:ctl00$plcMain$monthBtn
__EVENTARGUMENT:
__LASTFOCUS:
__VIEWSTATE:<removed for brevity>
lng:en-US
__EVENTVALIDATION:/wEWEgLbj/nSDgKt983zDgKWlOLbAQKr3LqFAwKL3uqpBwK9kfRnArDHltMCAuTk0eAHAsfniK0DAteIosMPAsiIosMPAsmIosMPAsuIosMPAoD0ookDApCbiOcPAo biOcPAombiOcPAoubiOcPyfqRx8FdqYzlnnkXcJEJZzzopJY=
ctl00$searchbox1$txtWord:Search
textfield2:Enter your search here
ctl00$plcMain$lstbxCategory:0
ctl00$plcMain$lstbxSubCategory:0
ctl00$plcMain$hdnEventCount:2

Below is the network trace I get in chrome when clicking the monthly button. Notice the __EVENTTARGET:ctl00$plcMain$monthBtn asp.net has a javascript framework in there that is calling a javascript:postback() method when that link is clicked, which sets the event target. That's basically how ASP.NET webforms knows which event to fire on a postback. One tricky thing here is that the web page is using an update panel so you might not get a true html response. If you can get your request to look something like this, then you should get a successful response. Hope this helps.

Request URL:http://www.wphospital.org/News-Events/Calendar-of-Events.aspx
Request Method:POST
Status Code:200 OK
Request Headers
Accept-Charset:ISO-8859-1,utf-8;q=0.7,*;q=0.3
Accept-Encoding:gzip,deflate,sdch
Accept-Language:en-US,en;q=0.8
Cache-Control:no-cache
Content-Length:9718
Content-Type:application/x-www-form-urlencoded
Cookie:CMSPreferredCulture=en-US; ASP.NET_SessionId=h2nval45vq0q5yb0cp233huc; __utma=101137351.234148951.1312486481.1312486481.1312486481.1; __utmb=101137351.1.10.1312486481; __utmc=101137351; __utmz=101137351.1312486481.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __unam=ef169fe-131964a5f2a-24ec879b-1
Host:www.wphospital.org
Origin:http://www.wphospital.org
Proxy-Connection:keep-alive
Referer:http://www.wphospital.org/News-Events/Calendar-of-Events.aspx
User-Agent:Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.124 Safari/534.30
X-MicrosoftAjax:Delta=true
Form Dataview URL encoded
ctl00$manScript:ctl00$plcMain$updTab|ctl00$plcMain$monthBtn
__EVENTTARGET:ctl00$plcMain$monthBtn
__EVENTARGUMENT:
__LASTFOCUS:
__VIEWSTATE:<removed for brevity>
lng:en-US
__EVENTVALIDATION:/wEWEgLbj/nSDgKt983zDgKWlOLbAQKr3LqFAwKL3uqpBwK9kfRnArDHltMCAuTk0eAHAsfniK0DAteIosMPAsiIosMPAsmIosMPAsuIosMPAoD0ookDApCbiOcPAo biOcPAombiOcPAoubiOcPyfqRx8FdqYzlnnkXcJEJZzzopJY=
ctl00$searchbox1$txtWord:Search
textfield2:Enter your search here
ctl00$plcMain$lstbxCategory:0
ctl00$plcMain$lstbxSubCategory:0
ctl00$plcMain$hdnEventCount:2
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文