Python - Automatic Wikipedia Downloader

Posted on 2024-10-21 05:56:38

[Using Python 3.1] Does anyone have any idea how to make a Python 3 application that lets the user write a text file with multiple words separated by commas? The program should read the file and download the Wikipedia page for each requested item, e.g. if they typed hello,python-3,chicken it would go to Wikipedia and download http://www.wikipedia.com/wiki/hello, http://www.wikip... Anyone think they can do this?

When I say "download", I mean download the text; images don't matter.


鸠魁 2024-10-28 05:56:38

You described exactly how to make such a program. So what is the question?

You read the file, split on commas, and download the URL. Done!
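A minimal Python 3 sketch of that recipe might look like the following. It assumes the comma-separated keywords live in a file called keywords.txt (a made-up name) and that fetching the plain article URL https://en.wikipedia.org/wiki/&lt;word&gt; is good enough; it saves each page's raw HTML next to the script.

# Sketch only: read comma-separated keywords and save each Wikipedia page's HTML.
import urllib.error
import urllib.parse
import urllib.request

def download_pages(path="keywords.txt"):  # hypothetical input file name
    with open(path, "r", encoding="utf-8") as f:
        # "hello,python-3,chicken" -> ["hello", "python-3", "chicken"]
        keywords = [word.strip() for word in f.read().split(",") if word.strip()]

    for word in keywords:
        url = "https://en.wikipedia.org/wiki/" + urllib.parse.quote(word)
        # Wikipedia tends to reject requests without a User-Agent header.
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        try:
            html = urllib.request.urlopen(request).read()
        except urllib.error.HTTPError as err:
            print("Skipping", word, "-", err)
            continue
        with open(word + ".html", "wb") as out:
            out.write(html)
        print("Saved", url)

if __name__ == "__main__":
    download_pages()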

风渺 2024-10-28 05:56:38

Check the following code. It downloads the HTML without the images, but you can get their URLs from the XML response that is being parsed. It expects the keywords in example.txt, one per line, and an existing Html sub-directory for the output.

from time import sleep
from xml.dom import minidom, Node
import os
import urllib
import urllib2


def main():
    # Read the keywords, one per line, from example.txt.
    keywords = []
    key_file = open("example.txt", 'r')
    for keyword_line in key_file.readlines():
        keyword = keyword_line.rstrip("\n")
        if keyword:
            keywords.append(keyword)
    key_file.close()

    print "Total keywords: %d" % len(keywords)

    for keyword in keywords:
        # Query the MediaWiki OpenSearch API; it returns XML with the
        # matching titles, descriptions and article URLs.
        url = ("http://en.wikipedia.org/w/api.php?format=xml&action=opensearch&search="
               + urllib.quote(keyword))
        xmldoc = minidom.parse(urllib.urlopen(url))
        root_node = xmldoc.childNodes[0]

        # Find the <Section> element that wraps the result items.
        section_node = None
        for node in root_node.childNodes:
            if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Section":
                section_node = node
                break

        if section_node is None:
            continue

        # Collect every <Item> (one per search result).
        items = []
        for node in section_node.childNodes:
            if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Item":
                items.append(node)

        if len(items) == 0:
            print "No results found"
            continue

        # Print the title (<Text>) of every result.
        print "\nResults found for " + keyword + ":\n"
        for item in items:
            for node in item.childNodes:
                if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Text":
                    if len(node.childNodes) == 1:
                        print node.childNodes[0].data.encode('utf-8')

        # Use the first result's title as the output file name.
        # The Html sub-directory must already exist.
        file_name = None
        for node in items[0].childNodes:
            if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Text":
                if len(node.childNodes) == 1:
                    title = node.childNodes[0].data.encode('utf-8')
                    file_name = os.path.join("Html", "%s.html" % title)
                    break

        if file_name is None:
            continue

        # Download the first result's article (<Url>) and save the raw HTML.
        # Binary mode so the response bytes are written unmodified.
        out_file = open(file_name, 'wb')
        for node in items[0].childNodes:
            if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Url":
                if len(node.childNodes) == 1:
                    user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)'
                    header = {'User-Agent': user_agent}
                    request = urllib2.Request(url=node.childNodes[0].data, headers=header)
                    out_file.write(urllib2.urlopen(request).read())
                    break
        out_file.close()

    print "Sleeping"
    sleep(2)


if __name__ == "__main__":
    main()
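Note that the snippet above is Python 2 (print statements, urllib/urllib2), while the question targets Python 3.1. Under Python 3 the downloading part would use urllib.request instead; a rough sketch of the equivalent calls, using a hypothetical fetch() helper:

import urllib.parse
import urllib.request

def fetch(url, user_agent='Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)'):
    # urllib.request merges Python 2's urllib/urllib2; read() returns bytes.
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    return urllib.request.urlopen(request).read()

# The OpenSearch query would be built the same way, e.g.:
# url = ("http://en.wikipedia.org/w/api.php?format=xml&action=opensearch&search="
#        + urllib.parse.quote(keyword))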