如何按一个元素对JSON文件进行排序并从中删除重复项?

发布于 2025-01-20 13:54:56 字数 1725 浏览 2 评论 0原文

我们假设使用 scrapy (python) 将 Billboard 100 艺术家提取到一个 json 文件中,并抓取每个页面中的前 5 个艺术家,并按字母顺序对它们进行排序并删除重复项。然后,将它们加载到新的谷歌工作表中。这是我到目前为止所做的:

import json
import scrapy
import datetime
from datetime import datetime
from datetime import timedelta, date


class BillboardWeeklySpider(scrapy.Spider):
    """Crawl the Billboard Artist 100 chart for the current week and the
    three preceding weeks, yielding the top-5 artists of each chart page.

    Yields dicts with keys ``name`` (artist name), ``rank`` (chart position
    as a string) and ``date`` (the chart URL, which encodes the chart date).
    """

    name = 'billboard-weekly'
    allowed_domains = ['www.billboard.com']
    start_urls = ['https://www.billboard.com/charts/artist-100/']

    def __init__(self, *args, **kwargs):
        # Forward to scrapy.Spider so the base class is properly initialised
        # (the original override silently skipped super().__init__()).
        super().__init__(*args, **kwargs)
        self.last_week_str = ""

    def parse(self, response):
        """Schedule the chart page for each of the last four weeks."""
        # The date picker holds the chart date; it does not change between
        # loop iterations, so read and parse it once outside the loop.
        string_date = response.css('#chart-date-picker::attr(data-date)').get()
        if string_date is None:
            # Page layout changed or element missing — nothing to schedule.
            self.logger.warning("No chart date found on %s", response.url)
            return
        chart_date = datetime.strptime(string_date, '%Y-%m-%d')
        one_week = timedelta(weeks=1)
        for weeks_back in range(4):
            week = chart_date - weeks_back * one_week
            self.last_week_str = week.strftime('%Y-%m-%d')
            next_page = f"https://www.billboard.com/charts/artist-100/{self.last_week_str}"
            # (The original guarded `if next_page:` — always true for a
            # non-empty f-string, so the guard was dead code.)
            yield response.follow(next_page, callback=self.week_parse)

    def week_parse(self, response):
        """Yield the top-5 entries of one weekly chart page."""
        for element in response.css('.o-chart-results-list-row-container'):
            name = element.css('#title-of-a-story::text').get()
            # Raw string: '\@' in a normal string is an invalid escape
            # (SyntaxWarning on Python 3.12+); CSS needs the backslash.
            number = element.css(
                r'span.c-label.a-font-primary-bold-l.u-font-size-32\@tablet.u-letter-spacing-0080\@tablet::text').get()
            if name is None or number is None:
                # Malformed row — skip rather than crash on None.strip().
                continue
            clean_name = name.strip()
            clean_number = number.strip()
            # Rows come in rank order; stop once we pass rank 5.
            if int(clean_number) > 5:
                break
            yield {
                'name': clean_name,
                'rank': clean_number,
                'date': response.url
            }

We are supposed to extract the Billboard 100 artists into a JSON file with Scrapy (Python), grab the first 5 artists on each page, sort them alphabetically, and remove the duplicates. Then, load them into a new Google Sheet. This is what I've done so far:

import json
import scrapy
import datetime
from datetime import datetime
from datetime import timedelta, date


class BillboardWeeklySpider(scrapy.Spider):
    """Crawl the Billboard Artist 100 chart for the current week and the
    three preceding weeks, yielding the top-5 artists of each chart page.

    Yields dicts with keys ``name`` (artist name), ``rank`` (chart position
    as a string) and ``date`` (the chart URL, which encodes the chart date).
    """

    name = 'billboard-weekly'
    allowed_domains = ['www.billboard.com']
    start_urls = ['https://www.billboard.com/charts/artist-100/']

    def __init__(self, *args, **kwargs):
        # Forward to scrapy.Spider so the base class is properly initialised
        # (the original override silently skipped super().__init__()).
        super().__init__(*args, **kwargs)
        self.last_week_str = ""

    def parse(self, response):
        """Schedule the chart page for each of the last four weeks."""
        # The date picker holds the chart date; it does not change between
        # loop iterations, so read and parse it once outside the loop.
        string_date = response.css('#chart-date-picker::attr(data-date)').get()
        if string_date is None:
            # Page layout changed or element missing — nothing to schedule.
            self.logger.warning("No chart date found on %s", response.url)
            return
        chart_date = datetime.strptime(string_date, '%Y-%m-%d')
        one_week = timedelta(weeks=1)
        for weeks_back in range(4):
            week = chart_date - weeks_back * one_week
            self.last_week_str = week.strftime('%Y-%m-%d')
            next_page = f"https://www.billboard.com/charts/artist-100/{self.last_week_str}"
            # (The original guarded `if next_page:` — always true for a
            # non-empty f-string, so the guard was dead code.)
            yield response.follow(next_page, callback=self.week_parse)

    def week_parse(self, response):
        """Yield the top-5 entries of one weekly chart page."""
        for element in response.css('.o-chart-results-list-row-container'):
            name = element.css('#title-of-a-story::text').get()
            # Raw string: '\@' in a normal string is an invalid escape
            # (SyntaxWarning on Python 3.12+); CSS needs the backslash.
            number = element.css(
                r'span.c-label.a-font-primary-bold-l.u-font-size-32\@tablet.u-letter-spacing-0080\@tablet::text').get()
            if name is None or number is None:
                # Malformed row — skip rather than crash on None.strip().
                continue
            clean_name = name.strip()
            clean_number = number.strip()
            # Rows come in rank order; stop once we pass rank 5.
            if int(clean_number) > 5:
                break
            yield {
                'name': clean_name,
                'rank': clean_number,
                'date': response.url
            }

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

橘亓 2025-01-27 13:54:56
import scrapy
import json
import datetime
from datetime import datetime
from datetime import timedelta, date


class QuestionSpider(scrapy.Spider):
    """Scrape the top-5 artists of the Billboard Artist 100 for the current
    week and the three previous weeks, writing one JSON object per line
    (JSON Lines) to ``billboard.json``.
    """

    name = 'billboard-weekly'
    allowed_domains = ['www.billboard.com']
    start_urls = ['https://www.billboard.com/charts/artist-100/']

    def __init__(self, *args, **kwargs):
        # Forward to scrapy.Spider so the base class is properly initialised.
        super().__init__(*args, **kwargs)
        self.last_week_str = ""
        # Open the output per spider instance instead of at import time:
        # the original class-level `open()` + `global f` handle was created
        # as a side effect of defining the class and was never closed.
        self.outfile = open("billboard.json", "w", encoding="utf-8")

    def closed(self, reason):
        """Scrapy calls this when the spider finishes; release the file."""
        self.outfile.close()

    def parse(self, response):
        """Schedule the chart page for each of the last four weeks."""
        # The chart date is invariant across iterations — read it once.
        string_date = response.css('#chart-date-picker::attr(data-date)').get()
        if string_date is None:
            self.logger.warning("No chart date found on %s", response.url)
            return
        chart_date = datetime.strptime(string_date, '%Y-%m-%d')
        one_week = timedelta(weeks=1)
        for weeks_back in range(4):
            week = chart_date - weeks_back * one_week
            self.last_week_str = week.strftime('%Y-%m-%d')
            next_page = f"https://www.billboard.com/charts/artist-100/{self.last_week_str}"
            yield response.follow(next_page, callback=self.week_parse)

    def week_parse(self, response):
        """Write the top-5 entries of one weekly chart page as JSON lines."""
        for element in response.css('.o-chart-results-list-row-container'):
            name = element.css('#title-of-a-story::text').get()
            # Raw string: '\@' in a normal string is an invalid escape
            # (SyntaxWarning on Python 3.12+); CSS needs the backslash.
            number = element.css(
                r'span.c-label.a-font-primary-bold-l.u-font-size-32\@tablet.u-letter-spacing-0080\@tablet::text').get()
            if name is None or number is None:
                # Malformed row — skip rather than crash on None.strip().
                continue
            clean_name = name.strip()
            clean_number = number.strip()
            # Rows come in rank order; stop once we pass rank 5.
            if int(clean_number) > 5:
                break
            # Fresh dict per record (the original reused one dict), and a
            # trailing newline so the output is valid JSON Lines — the
            # original concatenated objects with no separator, producing
            # an unparseable file.
            item = {
                'name': clean_name,
                'rank': clean_number,
                'date': response.url,
            }
            self.outfile.write(json.dumps(item, ensure_ascii=False) + "\n")
            self.logger.info("scraped %s", item)
import scrapy
import json
import datetime
from datetime import datetime
from datetime import timedelta, date


class QuestionSpider(scrapy.Spider):
    """Scrape the top-5 artists of the Billboard Artist 100 for the current
    week and the three previous weeks, writing one JSON object per line
    (JSON Lines) to ``billboard.json``.
    """

    name = 'billboard-weekly'
    allowed_domains = ['www.billboard.com']
    start_urls = ['https://www.billboard.com/charts/artist-100/']

    def __init__(self, *args, **kwargs):
        # Forward to scrapy.Spider so the base class is properly initialised.
        super().__init__(*args, **kwargs)
        self.last_week_str = ""
        # Open the output per spider instance instead of at import time:
        # the original class-level `open()` + `global f` handle was created
        # as a side effect of defining the class and was never closed.
        self.outfile = open("billboard.json", "w", encoding="utf-8")

    def closed(self, reason):
        """Scrapy calls this when the spider finishes; release the file."""
        self.outfile.close()

    def parse(self, response):
        """Schedule the chart page for each of the last four weeks."""
        # The chart date is invariant across iterations — read it once.
        string_date = response.css('#chart-date-picker::attr(data-date)').get()
        if string_date is None:
            self.logger.warning("No chart date found on %s", response.url)
            return
        chart_date = datetime.strptime(string_date, '%Y-%m-%d')
        one_week = timedelta(weeks=1)
        for weeks_back in range(4):
            week = chart_date - weeks_back * one_week
            self.last_week_str = week.strftime('%Y-%m-%d')
            next_page = f"https://www.billboard.com/charts/artist-100/{self.last_week_str}"
            yield response.follow(next_page, callback=self.week_parse)

    def week_parse(self, response):
        """Write the top-5 entries of one weekly chart page as JSON lines."""
        for element in response.css('.o-chart-results-list-row-container'):
            name = element.css('#title-of-a-story::text').get()
            # Raw string: '\@' in a normal string is an invalid escape
            # (SyntaxWarning on Python 3.12+); CSS needs the backslash.
            number = element.css(
                r'span.c-label.a-font-primary-bold-l.u-font-size-32\@tablet.u-letter-spacing-0080\@tablet::text').get()
            if name is None or number is None:
                # Malformed row — skip rather than crash on None.strip().
                continue
            clean_name = name.strip()
            clean_number = number.strip()
            # Rows come in rank order; stop once we pass rank 5.
            if int(clean_number) > 5:
                break
            # Fresh dict per record (the original reused one dict), and a
            # trailing newline so the output is valid JSON Lines — the
            # original concatenated objects with no separator, producing
            # an unparseable file.
            item = {
                'name': clean_name,
                'rank': clean_number,
                'date': response.url,
            }
            self.outfile.write(json.dumps(item, ensure_ascii=False) + "\n")
            self.logger.info("scraped %s", item)
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文