File storage problem with a Python web crawler

Posted 2024-11-30 06:53:33


I am screen scraping data with a web crawler and storing the results (tweets from a Twitter page) as separate HTML files, one per user I'm crawling. I intend to parse the HTML files later and store the data in a database for analysis. However, I am running into a bizarre problem.
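As an aside, the later parse-and-store step might look roughly like the sketch below. It assumes each saved file holds the username followed by tweet bodies, all separated by blank lines (which is what the crawler shown below writes); the sqlite3 schema, table name, and file path are made up for illustration.

import sqlite3

user = "NYTimesKrugman"

conn = sqlite3.connect("tweets.db")
conn.execute("CREATE TABLE IF NOT EXISTS tweets (user TEXT, body TEXT)")

# Assumed file layout: username first, then one tweet body per
# blank-line-separated chunk (see the crawler below).
with open("C:\\Python28\\Followertest1\\" + user + ".html") as f:
    chunks = f.read().split('\n\n')

for body in chunks[1:]:  # chunks[0] is the username header
    if body.strip():
        conn.execute("INSERT INTO tweets (user, body) VALUES (?, ?)", (user, body))

conn.commit()
conn.close()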

When I run the following program, a small snippet from the overall crawler, I am able to get a separate HTML file for each follower:

import re
import urllib2
import twitter

start_follower = "NYTimesKrugman"
depth = 3

searched = set()

api = twitter.Api()

def crawl(follower, in_depth):
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest1\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            crawl(name, in_depth-1) 

crawl(start_follower, depth)

for x in searched:
    print x
print "Program is completed."

However, when I run the full crawler, I do not get a separate file for each follower:

import twitter
import urllib
from BeautifulSoup import BeautifulSoup
import re
import time

start_follower = "NYTimeskrugman" 
depth = 2
searched = set()

api = twitter.Api()


def add_to_U(user):
    U.append(user)

def site(follower): #creates a twitter site url in string format based on the follower username
    followersite = "http://mobile.twitter.com/" + follower
    return followersite

def getPage(follower): #obtains access to a webpage
    url = site(follower)
    response = urllib.urlopen(url)
    return response

def getSoup(response): #creates the parsing module
    html = response.read()
    soup = BeautifulSoup(html)
    return soup

def gettweets(soup, output):
    tags = soup.findAll('div', {'class': 'list-tweet'})  # to obtain the tweets of a follower
    for tag in tags: 
        a = tag.renderContents()
        b = str(a)
        output.write(b)
        output.write('\n\n')

def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter 
    links = soup.findAll('a', {'href': True}, {id: 'more_link'})
    for link in links:
        b = link.renderContents()
        test_b = str(b)
        if test_b.find('more') != -1:
            return True
    return False

def getnewlink(soup): #to get the link to go to the next page of tweets on twitter 
    links = soup.findAll('a', {'href': True}, {id : 'more_link'})
    for link in links:
        b = link.renderContents()
        if str(b) == 'more':
            c = link['href']
            d = 'http://mobile.twitter.com' +c
            return d

def crawl(follower, in_depth): #main method of sorts
    if in_depth > 0:
        searched.add(follower)
        directory = "C:\\Python28\\Followertest2\\" + follower + ".html"
        output = open(directory, 'a')
        output.write(follower)
        output.write('\n\n')
        a = getPage(follower)
        soup = getSoup(a)
        gettweets(soup, output)
        tweets = are_more_tweets(soup)
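        # keep following the 'more' link, appending each page of tweets to the same file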
        while(tweets): 
            b = getnewlink(soup)
            red = urllib.urlopen(b)
            html = red.read()
            soup = BeautifulSoup(html)
            gettweets(soup, output)
            tweets = are_more_tweets(soup)
        users = api.GetFriends(follower)
        names = set([str(u.screen_name) for u in users])
        names -= searched
        for name in list(names)[0:5]:
            print name
            crawl(name, in_depth - 1)

crawl(start_follower, depth)
print("Program done. Look at output file.")

More specifically, I seem to get a separate html file for about the first five followers and then no new files appear to be created. Any help would be appreciated!


Comments (1)

豆芽 2024-12-07 06:53:33


The depth value is different between the snippet and the full code (depth = 3 in the snippet, depth = 2 in the full crawler), so you only get one level of recursion in the full code. Also, you only grab the first five names from the friends list: for name in list(names)[0:5]: So you get six people total: the starting follower and their first five friends.
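To see the arithmetic, here is a minimal dry run of the recursion with a stub standing in for api.GetFriends (the friend names are made up for illustration):

fake_friends = {"NYTimeskrugman": ["a", "b", "c", "d", "e", "f", "g"]}

searched = set()
files_created = []

def crawl(follower, in_depth):
    if in_depth > 0:
        searched.add(follower)
        files_created.append(follower + ".html")  # stands in for open(directory, 'a')
        names = set(fake_friends.get(follower, []))
        names -= searched
        for name in list(names)[0:5]:
            crawl(name, in_depth - 1)

crawl("NYTimeskrugman", 2)
print(len(files_created))  # 6: the starting follower plus five friends

With in_depth = 2 at the top, the five recursive calls run at in_depth = 1, so each friend gets a file but never recurses further; with depth = 3, as in the snippet, you would get a third level of users and many more files.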
