Nodejs Cheerio 获取所有文本
我有以下页面的 html 代码:
<div class="textwidget">
<p>
<strong>Monday-Tuesday (Feb 21-22)</strong>
<br />
<a href="https://dramaday.net/moonshine/">Moonshine</a> 15-16 (KBS2)<br />
<a href="https://dramaday.net/ghost-doctor/">Ghost Doctor</a> 15-16 (tvN)<br />
<strong>Wednesday-Thursday (Feb 23-24)</strong>
<br />
<a href="https://dramaday.net/rookie-cops/">Rookie Cops</a> 9-10
(Disney+)<br />
<a href="https://dramaday.net/grid/">Grid</a> 2 (Disney+)<br />
<a href="https://dramaday.net/thirty-nine/">Thirty Nine</a> 3-4 (JTBC)<br />
Kill Heel 1-2 (tvN)<br /> <!-- It is not taken -->
Sponsor 1-2 (MBN)<br /> <!-- It is not taken -->
<strong>Friday-Sunday (Feb 25-27)</strong>
<br />
Juvenile Justice 1-10 (Netflix)<br /> <!-- It is not taken -->
<a href="https://dramaday.net/through-the-darkness/"
>Through the Darkness</a
>
7-8 (SBS)<br />
<a href="https://dramaday.net/twenty-five-twenty-one/"
>Twenty Five Twenty One</a
>
5-6 (tvN)<br />
<a href="https://dramaday.net/forecasting-love-and-weather/"
>Forecasting Love and Weather</a
>
5-6 (JTBC)<br />
<a href="https://dramaday.net/young-lady-and-gentleman/"
>Young Lady and Gentleman</a
>
43-44 (KBS2)<br />
<a href="https://dramaday.net/love-ft-marriage-and-divorce/"
>Love (ft. Marriage and Divorce) S3</a
>
1-2 (CSTV)
</p>
</div>
我有以下 js 代码来从页面获取信息:
/**
 * Downloads the dramaday.net front page and turns the schedule widget into a
 * flat list of entries.
 *
 * Every node matched by the selector is read as text. A node whose text
 * contains one of the day-range markers is treated as a heading: it updates
 * the current `day` index and `date` tokens. Any other node is pushed as an
 * entry carrying the most recent `day` / `date`.
 *
 * NOTE(review): `.each` on this selector only visits what the selector
 * matches. Judging by the empty `name` values this produces, that includes
 * the `<br>` elements but not bare text sitting directly inside the `<p>`,
 * so titles without a link and the text following each `<a>` never reach
 * the callback. Confirm against the cheerio selector documentation.
 *
 * @returns {Promise<Array<{name: string, day: number, date: (string[]|string)}>>}
 *   One object per non-heading node. `date` is the empty string for nodes
 *   that appear before the first heading.
 * @throws Whatever `axios.get` or `cheerio.load` throws; errors are not
 *   wrapped or swallowed.
 */
async function getData() {
  // Checked in this order; the position in the list becomes the `day` value.
  const dayMarkers = ["Monday-Tuesday", "Wednesday-Thursday", "Friday-Sunday"];

  const response = await axios.get("https://dramaday.net");
  const $ = cheerio.load(response.data);

  const entries = [];
  let day = 0;
  let date = "";

  $("div.textwidget > p >").each((_idx, el) => {
    const text = $(el).text();
    const markerIndex = dayMarkers.findIndex((marker) => text.includes(marker));

    if (markerIndex === -1) {
      // Not a heading: record it under the heading seen most recently.
      entries.push({ name: text, day, date });
      return;
    }

    // Heading such as "Monday-Tuesday (Feb 21-22)": drop the first "(" and
    // the first ")" and split on single spaces.
    day = markerIndex;
    date = text.replace('(', '').replace(')', '').split(' ');
  });

  return entries;
}
问题是没有被标签包裹的文本不会被获取。
例如,参见 html 代码中标有 “It is not taken” 注释的那几行。
我得到的结果如下,正如你所看到的,有很多空名称,我认为这是由于 br 标签造成的。
我可以通过将 else 替换为带有条件 text !== '' 的 else if 来去掉这些空名称,但这并没有解决文本不在任何标签中的问题。除此之外,我也无法获取 a 标签后面的文本(即剧集编号和频道)。
你能帮我个忙吗?
[
{ name: '', day: 0, date: [ 'Monday-Tuesday', 'Feb', '21-22' ] },
{
name: 'Moonshine',
day: 0,
date: [ 'Monday-Tuesday', 'Feb', '21-22' ]
},
{ name: '', day: 0, date: [ 'Monday-Tuesday', 'Feb', '21-22' ] },
{
name: 'Ghost Doctor',
day: 0,
date: [ 'Monday-Tuesday', 'Feb', '21-22' ]
},
{ name: '', day: 0, date: [ 'Monday-Tuesday', 'Feb', '21-22' ] },
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{
name: 'Rookie Cops',
day: 1,
date: [ 'Wednesday-Thursday', 'Feb', '23-24' ]
},
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{
name: 'Grid',
day: 1,
date: [ 'Wednesday-Thursday', 'Feb', '23-24' ]
},
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{
name: 'Thirty Nine',
day: 1,
date: [ 'Wednesday-Thursday', 'Feb', '23-24' ]
},
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Through the Darkness',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
},
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Twenty Five Twenty One',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
},
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Forecasting Love and Weather',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
},
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Young Lady and Gentleman',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
},
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Love (ft. Marriage and Divorce) S3',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
}
]
I have the following html code of the page:
<div class="textwidget">
<p>
<strong>Monday-Tuesday (Feb 21-22)</strong>
<br />
<a href="https://dramaday.net/moonshine/">Moonshine</a> 15-16 (KBS2)<br />
<a href="https://dramaday.net/ghost-doctor/">Ghost Doctor</a> 15-16 (tvN)<br />
<strong>Wednesday-Thursday (Feb 23-24)</strong>
<br />
<a href="https://dramaday.net/rookie-cops/">Rookie Cops</a> 9-10
(Disney+)<br />
<a href="https://dramaday.net/grid/">Grid</a> 2 (Disney+)<br />
<a href="https://dramaday.net/thirty-nine/">Thirty Nine</a> 3-4 (JTBC)<br />
Kill Heel 1-2 (tvN)<br /> <!-- It is not taken -->
Sponsor 1-2 (MBN)<br /> <!-- It is not taken -->
<strong>Friday-Sunday (Feb 25-27)</strong>
<br />
Juvenile Justice 1-10 (Netflix)<br /> <!-- It is not taken -->
<a href="https://dramaday.net/through-the-darkness/"
>Through the Darkness</a
>
7-8 (SBS)<br />
<a href="https://dramaday.net/twenty-five-twenty-one/"
>Twenty Five Twenty One</a
>
5-6 (tvN)<br />
<a href="https://dramaday.net/forecasting-love-and-weather/"
>Forecasting Love and Weather</a
>
5-6 (JTBC)<br />
<a href="https://dramaday.net/young-lady-and-gentleman/"
>Young Lady and Gentleman</a
>
43-44 (KBS2)<br />
<a href="https://dramaday.net/love-ft-marriage-and-divorce/"
>Love (ft. Marriage and Divorce) S3</a
>
1-2 (CSTV)
</p>
</div>
I have the following js code to get information from the page:
/**
 * Downloads the dramaday.net front page and turns the schedule widget into a
 * flat list of entries.
 *
 * Every node matched by the selector is read as text. A node whose text
 * contains one of the day-range markers is treated as a heading: it updates
 * the current `day` index and `date` tokens. Any other node is pushed as an
 * entry carrying the most recent `day` / `date`.
 *
 * NOTE(review): `.each` on this selector only visits what the selector
 * matches. Judging by the empty `name` values this produces, that includes
 * the `<br>` elements but not bare text sitting directly inside the `<p>`,
 * so titles without a link and the text following each `<a>` never reach
 * the callback. Confirm against the cheerio selector documentation.
 *
 * @returns {Promise<Array<{name: string, day: number, date: (string[]|string)}>>}
 *   One object per non-heading node. `date` is the empty string for nodes
 *   that appear before the first heading.
 * @throws Whatever `axios.get` or `cheerio.load` throws; errors are not
 *   wrapped or swallowed.
 */
async function getData() {
  // Checked in this order; the position in the list becomes the `day` value.
  const dayMarkers = ["Monday-Tuesday", "Wednesday-Thursday", "Friday-Sunday"];

  const response = await axios.get("https://dramaday.net");
  const $ = cheerio.load(response.data);

  const entries = [];
  let day = 0;
  let date = "";

  $("div.textwidget > p >").each((_idx, el) => {
    const text = $(el).text();
    const markerIndex = dayMarkers.findIndex((marker) => text.includes(marker));

    if (markerIndex === -1) {
      // Not a heading: record it under the heading seen most recently.
      entries.push({ name: text, day, date });
      return;
    }

    // Heading such as "Monday-Tuesday (Feb 21-22)": drop the first "(" and
    // the first ")" and split on single spaces.
    day = markerIndex;
    date = text.replace('(', '').replace(')', '').split(' ');
  });

  return entries;
}
The problem is that text which is not wrapped in a tag is not picked up.
See for example the lines in the html code marked with the comment "It is not taken".
The result I get is shown below; as you can see, there are a lot of empty names, which I think is due to the br tags.
I can get rid of the empty names by replacing the else with an else if with the condition text !== '', but that doesn't solve the problem of text that is not inside any tag.
Besides that, I can't get the text after the a tag, which would be the episode number and channel.
Can you give me a hand?
[
{ name: '', day: 0, date: [ 'Monday-Tuesday', 'Feb', '21-22' ] },
{
name: 'Moonshine',
day: 0,
date: [ 'Monday-Tuesday', 'Feb', '21-22' ]
},
{ name: '', day: 0, date: [ 'Monday-Tuesday', 'Feb', '21-22' ] },
{
name: 'Ghost Doctor',
day: 0,
date: [ 'Monday-Tuesday', 'Feb', '21-22' ]
},
{ name: '', day: 0, date: [ 'Monday-Tuesday', 'Feb', '21-22' ] },
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{
name: 'Rookie Cops',
day: 1,
date: [ 'Wednesday-Thursday', 'Feb', '23-24' ]
},
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{
name: 'Grid',
day: 1,
date: [ 'Wednesday-Thursday', 'Feb', '23-24' ]
},
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{
name: 'Thirty Nine',
day: 1,
date: [ 'Wednesday-Thursday', 'Feb', '23-24' ]
},
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{ name: '', day: 1, date: [ 'Wednesday-Thursday', 'Feb', '23-24' ] },
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Through the Darkness',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
},
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Twenty Five Twenty One',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
},
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Forecasting Love and Weather',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
},
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Young Lady and Gentleman',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
},
{ name: '', day: 2, date: [ 'Friday-Sunday', 'Feb', '25-27' ] },
{
name: 'Love (ft. Marriage and Divorce) S3',
day: 2,
date: [ 'Friday-Sunday', 'Feb', '25-27' ]
}
]
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)