Python Scrapy: I can't get any data

Posted 2025-02-09 02:25:24

from urllib import parse
import scrapy
from scrapy.linkextractors import LinkExtractor
import codecs
import json

class WanikaniSpider(scrapy.Spider):
    name = 'japandict'
    allowed_domains = ['www.wanikani.com']         
    url = ('https://www.wanikani.com/kanji/')
    start_urls = []
    kanjis = ['悪' ,'安' ,'以' ,'飲' ,'意' ,'医' ,'員' ,'運' ,'映' ,'英' ,'屋' ,'駅' ,'歌' ,'牛']
    liste=[]
    for kanji in kanjis:
        liste.append(kanji)
        nurl = url + kanji
        start_urls.append(nurl)
    file =  open("kanji.txt","a",encoding="utf-8")
    file1 = open("onyomi.txt","a",encoding="utf-8")
    file2 = open("kunyomi.txt","a",encoding="utf-8") 
    file3 = open("meanings.txt","a",encoding="utf-8")       
           
           
    def parse(self, response):
        print(response.url)
        kanjiicon = response.xpath('//*[@id="main"]/body/div[1]/div[3]/div/div/header/h1/span/text()').getall()
        meanings = response.xpath('//*[@id="main"]/body/div[1]/div[3]/div/div/header/h1/text()').getall()
        reading = response.xpath('//*[@id="reading"]/div') 
        for onkun in reading:
            onyomi= onkun.xpath('//*[@id="reading"]/div/div[1]/p/text()').getall()
            kunyomi= onkun.xpath('//*[@id="reading"]/div/div[2]/p/text()').getall()                
        for x in onyomi:
            x.strip()
            self.file1.write(x + "\n")
            self.file1.close
        for y in kanjiicon:
            self.file.write(y + "\n")
            self.file.close
        for z in kunyomi:
            self.file2.write(z + "\n")
            self.file.close
        for p in meanings:
            self.file3.write(p + "\n")
            self.file.close

Kanji are Japanese characters that have onyomi and kunyomi readings. I want to get these readings and the meaning of each kanji and write them to text files. There is a website where I can do this. The spider creates the txt files, but they are empty.

Comments (1)

牵你的手,一向走下去 2025-02-16 02:25:24

I see a few issues with your code. I am not certain this is everything needed to make your project work, but one main issue is how you are opening and closing the files. Right now you open them in your class definition and then close them with each and every request, which means that after the very first call to parse your files have already been closed and are no longer writable. What you should do is use Scrapy item pipelines for directing output and writing data to files. For example:

In your spider file:

import scrapy

class WanikaniSpider(scrapy.Spider):
    name = 'japandict'
    allowed_domains = ['www.wanikani.com']
    url = 'https://www.wanikani.com/kanji/'
    kanjis = ['悪', '安', '以', '飲', '意', '医', '員', '運', '映', '英', '屋', '駅', '歌', '牛']
    # Build one start URL per kanji page (e.g. https://www.wanikani.com/kanji/悪).
    start_urls = []
    for kanji in kanjis:
        start_urls.append(url + kanji)

    def parse(self, response):
        # Yield one small dict per piece of data; the item pipeline writes them to files.
        kanjiicon = response.xpath('//*[@id="main"]/body/div[1]/div[3]/div/div/header/h1/span/text()').getall()
        meanings = response.xpath('//*[@id="main"]/body/div[1]/div[3]/div/div/header/h1/text()').getall()
        for y in kanjiicon:
            yield {"kanji": y.strip()}
        for p in meanings:
            yield {"meanings": p.strip()}
        reading = response.xpath('//*[@id="reading"]/div')
        for onkun in reading:
            onyomi = onkun.xpath('//*[@id="reading"]/div/div[1]/p/text()').getall()
            kunyomi = onkun.xpath('//*[@id="reading"]/div/div[2]/p/text()').getall()
            for x in onyomi:
                yield {"onyomi": x.strip()}
            for z in kunyomi:
                yield {"kunyomi": z.strip()}

Then, in your pipelines.py file:

class SpidersPipeline:
    def process_item(self, item, spider):
        # Write whichever field the item carries to the matching output file.
        for i, kw in enumerate(["kanji", "onyomi", "kunyomi", "meanings"]):
            if kw in item:
                self.files[i].write(item[kw] + "\n")
        return item  # return the item so later pipelines and the stats collector still see it

    def open_spider(self, spider):
        # Open the four output files once, when the crawl starts.
        self.files = [open(name, "a", encoding="utf-8") for name in
                      ["kanji.txt", "onyomi.txt", "kunyomi.txt", "meanings.txt"]]

    def close_spider(self, spider):
        # Close them once, when the crawl ends.
        for f in self.files:
            f.close()
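
As a side note on the design (this variant is only an alternative sketch, not part of the original answer): the parallel list of field names and list of filenames has to be kept in sync by position. A dict keyed by field name keeps the mapping in one place and behaves the same way:

class SpidersPipeline:
    # Field name yielded by the spider -> text file it is appended to.
    FILES = {
        "kanji": "kanji.txt",
        "onyomi": "onyomi.txt",
        "kunyomi": "kunyomi.txt",
        "meanings": "meanings.txt",
    }

    def open_spider(self, spider):
        self.handles = {field: open(path, "a", encoding="utf-8")
                        for field, path in self.FILES.items()}

    def process_item(self, item, spider):
        for field, handle in self.handles.items():
            if field in item:
                handle.write(item[field] + "\n")
        return item

    def close_spider(self, spider):
        for handle in self.handles.values():
            handle.close()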

And remember to uncomment the pipeline in the settings.py file:

ITEM_PIPELINES = {
   'spiders.pipelines.SpidersPipeline': 300,   # <- make sure it is uncommented
}
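
With the pipeline enabled, the spider is run from the Scrapy project root in the usual way; the four text files should then be filled as items are yielded. Optionally, the built-in feed export can dump the same yielded dicts to JSON for inspection (a convenience, not required by the answer above):

scrapy crawl japandict
scrapy crawl japandict -o items.json   # also write every yielded dict to items.json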