Python - Automatic Wikipedia Downloader

Posted on 2024-10-21 05:56:38

[Using Python 3.1] Does anyone have any idea how to make a Python 3 application that lets the user write a text file with multiple words separated by commas? The program should read the file and download the Wikipedia page for each requested item, e.g. if they typed hello,python-3,chicken it would go to Wikipedia and download http://www.wikipedia.com/wiki/hello, http://www.wikip... Anyone think they can do this?

When I say "download", I mean download the text; images don't matter.


鸠魁 2024-10-28 05:56:38

You described exactly how to make such a program. So what is the question?

You read the file, split on commas, and download the URL. Done!
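A minimal Python 3 sketch of that recipe might look like the following. It assumes the comma-separated keywords live in a file called keywords.txt (a made-up name) and that fetching the plain article URL https://en.wikipedia.org/wiki/&lt;word&gt; is good enough; it saves each page's raw HTML next to the script.

# Sketch only: read comma-separated keywords and save each Wikipedia page's HTML.
import urllib.error
import urllib.parse
import urllib.request

def download_pages(path="keywords.txt"):  # hypothetical input file name
    with open(path, "r", encoding="utf-8") as f:
        # "hello,python-3,chicken" -> ["hello", "python-3", "chicken"]
        keywords = [word.strip() for word in f.read().split(",") if word.strip()]

    for word in keywords:
        url = "https://en.wikipedia.org/wiki/" + urllib.parse.quote(word)
        # Wikipedia tends to reject requests without a User-Agent header.
        request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        try:
            html = urllib.request.urlopen(request).read()
        except urllib.error.HTTPError as err:
            print("Skipping", word, "-", err)
            continue
        with open(word + ".html", "wb") as out:
            out.write(html)
        print("Saved", url)

if __name__ == "__main__":
    download_pages()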

风渺 2024-10-28 05:56:38

Check the following code. It downloads the HTML without the images, but you can get their URLs from the XML response that is being parsed. It expects the keywords in example.txt, one per line, and an existing Html sub-directory for the output.

from time import sleep
from xml.dom import minidom, Node
import os
import urllib
import urllib2


def main():
    # Read the keywords, one per line, from example.txt.
    keywords = []
    key_file = open("example.txt", 'r')
    for keyword_line in key_file.readlines():
        keyword = keyword_line.rstrip("\n")
        if keyword:
            keywords.append(keyword)
    key_file.close()

    print "Total keywords: %d" % len(keywords)

    for keyword in keywords:
        # Query the MediaWiki OpenSearch API; it returns XML with the
        # matching titles, descriptions and article URLs.
        url = ("http://en.wikipedia.org/w/api.php?format=xml&action=opensearch&search="
               + urllib.quote(keyword))
        xmldoc = minidom.parse(urllib.urlopen(url))
        root_node = xmldoc.childNodes[0]

        # Find the <Section> element that wraps the result items.
        section_node = None
        for node in root_node.childNodes:
            if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Section":
                section_node = node
                break

        if section_node is None:
            continue

        # Collect every <Item> (one per search result).
        items = []
        for node in section_node.childNodes:
            if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Item":
                items.append(node)

        if len(items) == 0:
            print "No results found"
            continue

        # Print the title (<Text>) of every result.
        print "\nResults found for " + keyword + ":\n"
        for item in items:
            for node in item.childNodes:
                if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Text":
                    if len(node.childNodes) == 1:
                        print node.childNodes[0].data.encode('utf-8')

        # Use the first result's title as the output file name.
        # The Html sub-directory must already exist.
        file_name = None
        for node in items[0].childNodes:
            if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Text":
                if len(node.childNodes) == 1:
                    title = node.childNodes[0].data.encode('utf-8')
                    file_name = os.path.join("Html", "%s.html" % title)
                    break

        if file_name is None:
            continue

        # Download the first result's article (<Url>) and save the raw HTML.
        # Binary mode so the response bytes are written unmodified.
        out_file = open(file_name, 'wb')
        for node in items[0].childNodes:
            if node.nodeType == Node.ELEMENT_NODE and node.nodeName == "Url":
                if len(node.childNodes) == 1:
                    user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)'
                    header = {'User-Agent': user_agent}
                    request = urllib2.Request(url=node.childNodes[0].data, headers=header)
                    out_file.write(urllib2.urlopen(request).read())
                    break
        out_file.close()

    print "Sleeping"
    sleep(2)


if __name__ == "__main__":
    main()
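Note that the snippet above is Python 2 (print statements, urllib/urllib2), while the question targets Python 3.1. Under Python 3 the downloading part would use urllib.request instead; a rough sketch of the equivalent calls, using a hypothetical fetch() helper:

import urllib.parse
import urllib.request

def fetch(url, user_agent='Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)'):
    # urllib.request merges Python 2's urllib/urllib2; read() returns bytes.
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    return urllib.request.urlopen(request).read()

# The OpenSearch query would be built the same way, e.g.:
# url = ("http://en.wikipedia.org/w/api.php?format=xml&action=opensearch&search="
#        + urllib.parse.quote(keyword))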