requests爬取代理网站ip并验证，验证通过，但是登陆ip网站查询ip并未变化

发布于 2022-08-29 20:04:45 字数 3688 浏览 15 评论 0

改写了网上一个爬取ip代理并验证的代码，验证不是去登陆度娘，而是换了ip查询网站，原以为会用爬取的代理IP，实际上ip并没有变化，烦请大牛看下什么原因，附上运行结果：

#

{'https': u'183.221.50.139:8123'} 您的IP:[218.88.XX.XX] 来自:四川省成都市电信操作系统:Unknown浏览器:Unknown 0.0python-requests/2.4.0 CPython/2.7.3 Windows/7
{'https': u'116.236.216.116:8080'} 您的IP:[218.88.XX.XX] 来自:四川省成都市电信操作系统:Unknown浏览器:Unknown 0.0python-requests/2.4.0 CPython/2.7.3 Windows/7
{'https': u'183.221.160.44:8123'} 您的IP:[218.88.XX.XX] 来自:四川省成都市电信操作系统:Unknown浏览器:Unknown 0.0python-requests/2.4.0 CPython/2.7.3 Windows/7

#

代码如下

import requests
from lxml import etree
from bs4 import BeautifulSoup as bs
import Queue
import threading
import time
import datetime
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

# write proxy
def writeproxy(porxyinfo):
    writefile = file('porxyinfo.txt','a+')
    writefile.write(porxyinfo)
    writefile.write('\n')  
    writefile.close()

# return page code
def GetPageText(url):
    r = requests.get(url)
    return r.text

# return post urllist
def GetPostUrl(source):
    posturllist = []

    iplist = bs(source).find("table",{"id":"ip_list"}).findAll("tr")[1:]
    for item in iplist:
        getinfo = item.findAll("td")
        ip      = getinfo[1].get_text(strip='\r\n')
        port    = getinfo[2].get_text(strip='\r\n')
        address = getinfo[3].get_text(strip='\r\n')
        type    = getinfo[5].get_text(strip='\r\n')
        posturllist.append(type.lower()+'#'+ip+':'+port)
    return posturllist

def Checkproxy(porxyinfo):
    proxies = {}
    if porxyinfo.split('#')[0] == 'http':
        proxies['http'] = porxyinfo.split('#')[1]
    else:
        proxies['https'] = porxyinfo.split('#')[1]
    r = requests.get("http://ip.chinaz.com/", proxies=proxies,timeout=3)
    if r:
        print proxies, bs(requests.get('http://ip.chinaz.com/').content).find("span",{"class":"info3"}).get_text(strip='\r\n')
#         writeproxy(porxyinfo)
    else:
        print 'No'
def getproxyid():
    start = time.time()
    queue = Queue.Queue()
    class ThreadUrl(threading.Thread):
        """Threaded Url Grab"""
        def __init__(self, queue):
            threading.Thread.__init__(self)
            self.queue = queue
            global mutex
        def run(self):
            while True:
                porxyinfo = self.queue.get()
                try:
                    mutex.acquire(5)
                    try:
                        Checkproxy(porxyinfo)
                    except:
                        time.sleep(0.15)
                        mutex.release()
                        self.queue.task_done()
                        continue
                    time.sleep(0.15)
                    mutex.release()

                    self.queue.task_done()
                except Exception,e:
                    time.sleep(0.15)
                    self.queue.task_done()       

    pagenum =5
    targets  = ['http://www.xici.net.co/nn/%d'%page for page in range(1,pagenum+1)]
    targets += ['http://www.xici.net.co/wn/%d'%page for page in range(1,pagenum+1)]     
    for proxyurl in targets:
        try:
            PageText = GetPageText(proxyurl)
        except Exception,e:
            print e
            break
        PostUrlList = GetPostUrl(PageText)

        mutex = threading.Lock()
        for i in range(5):
            t = ThreadUrl(queue)
            t.setDaemon(True)
            try:
                t.start()
            except:
                pass

        for host in PostUrlList:
            queue.put(host)
        queue.join()
    print "Elapsed Time: %s" % (time.time() - start)

if __name__ == '__main__':
    getproxyid()

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

神仙妹妹 2022-09-05 20:04:45

    r = requests.get("http://ip.chinaz.com/", proxies=proxies,timeout=3)
    if r:
        print proxies, bs(requests.get('http://ip.chinaz.com/').content).find("span",{"class":"info3"}).get_text(strip='\r\n')

你这是在干什么？？使用代理请求一次，如果成功，那么不使用代理再请求一次，并检查这一次的返回数据？你不应该使用 bs(r.content).... 么？

PS: 拿 curl 的 User-Agent 访问 http://ip.cn ，不用解析，直接出结果。

回复收藏 0

~没有更多了~