如何使用 quanteda 计算两组文档中各个文档之间的余弦相似度？

发布于 2025-02-10 13:55:01 字数 1837 浏览 2 评论 0原文

我有两组文档：一组约有 580 篇新闻文章，另一组约有 560 项政治决定。我想找出各篇新闻文章与各项政治决定之间是否存在相似之处。也就是说，应使用余弦相似度将每一篇新闻文章与 560 项政治决定中的每一项逐一进行比较。我使用的是 quanteda 包。

这是我到目前为止尝试的：

# Read both document sets and wrap each one in a quanteda corpus.
# NOTE(review): both readtext() calls use the same `txt_directory`, so the two
# corpora would hold the same files -- presumably each set lives in its own
# folder; confirm and point each call at the right path.
news_articles <- readtext(paste0(txt_directory, "*"), encoding = "UTF-8")
news_articles_corpus <- corpus(news_articles)

pol_decisions <- readtext(paste0(txt_directory, "*"), encoding = "UTF-8")
pol_decisions_corpus <- corpus(pol_decisions)

# Tokenise a corpus, lower-case it, drop Danish stopwords and stem the rest.
# Kept in one helper so both corpora get exactly the same treatment; otherwise
# their features would not be comparable.
preprocess_tokens <- function(x) {
  toks <- tokens(
    x,
    what = "word",
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE,
    verbose = TRUE
  )
  toks <- tokens_tolower(toks, keep_acronyms = FALSE)
  toks <- tokens_select(toks, stopwords("danish"), selection = "remove")
  # The stemmer defaults to English; name the language explicitly so it
  # matches the Danish stopword list used above.
  tokens_wordstem(toks, language = "danish")
}

news_articles_toks <- preprocess_tokens(news_articles_corpus)
pol_decisions_toks <- preprocess_tokens(pol_decisions_corpus)

news_articles_dfm <- dfm(news_articles_toks)
pol_decisions_dfm <- dfm(pol_decisions_toks)

# Compare every news article (x) with every political decision (y).
# The deprecated `selection` argument is not passed.
cosine <- textstat_simil(
  x = news_articles_dfm,
  y = pol_decisions_dfm,
  margin = "documents",
  method = "cosine"
)

# Flatten to one row per document pair, most similar pairs first, and export.
cosine <- as.data.frame(cosine)
cosine <- cosine[order(-cosine$cosine), ]
write_xlsx(cosine, "Test.xlsx")

我的问题是，当我运行 textstat_simil 函数时，R 会返回所有组合的余弦值——既包括两组文档之间的组合，也包括每组文档内部的组合。但是我不想知道两篇新闻文章之间或两项政治决定之间的余弦相似度。我只想知道新闻文章与政治决定之间的余弦相似度。

有什么方法可以解决这个问题吗？

原文

I have two sets of documents: One with approx. 580 news articles and one with approx. 560 political decisions. I want to find out whether there are similarities between the individual news articles and the political decisions. This means that each individual news article should be compared with each of the 560 political decisions, using cosine similarity. I am using the quanteda package.

This is what I have tried so far:

# Read both document sets and wrap each one in a quanteda corpus.
# NOTE(review): both readtext() calls use the same `txt_directory`, so the two
# corpora would hold the same files -- presumably each set lives in its own
# folder; confirm and point each call at the right path.
news_articles <- readtext(paste0(txt_directory, "*"), encoding = "UTF-8")
news_articles_corpus <- corpus(news_articles)

pol_decisions <- readtext(paste0(txt_directory, "*"), encoding = "UTF-8")
pol_decisions_corpus <- corpus(pol_decisions)

# Tokenise a corpus, lower-case it, drop Danish stopwords and stem the rest.
# Kept in one helper so both corpora get exactly the same treatment; otherwise
# their features would not be comparable.
preprocess_tokens <- function(x) {
  toks <- tokens(
    x,
    what = "word",
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE,
    verbose = TRUE
  )
  toks <- tokens_tolower(toks, keep_acronyms = FALSE)
  toks <- tokens_select(toks, stopwords("danish"), selection = "remove")
  # The stemmer defaults to English; name the language explicitly so it
  # matches the Danish stopword list used above.
  tokens_wordstem(toks, language = "danish")
}

news_articles_toks <- preprocess_tokens(news_articles_corpus)
pol_decisions_toks <- preprocess_tokens(pol_decisions_corpus)

news_articles_dfm <- dfm(news_articles_toks)
pol_decisions_dfm <- dfm(pol_decisions_toks)

# Compare every news article (x) with every political decision (y).
# The deprecated `selection` argument is not passed.
cosine <- textstat_simil(
  x = news_articles_dfm,
  y = pol_decisions_dfm,
  margin = "documents",
  method = "cosine"
)

# Flatten to one row per document pair, most similar pairs first, and export.
cosine <- as.data.frame(cosine)
cosine <- cosine[order(-cosine$cosine), ]
write_xlsx(cosine, "Test.xlsx")

My problem is that when I run the textstat_simil function, R returns cosine values for all combinations - both within and between the two sets of documents. But I don't want to know the cosine similarity between two news articles or between two political decisions. I only want to know the cosine similarity between a news article and a political decision.

Is there any way to solve this issue?

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

虐人心 2025-02-17 13:55:01

在 textstat_simil() 中只使用 x 和 y 这两个参数。

# Minimal reproducible example: give one dfm as `x` and the other as `y`, and
# textstat_simil() returns only the news-by-decision similarities.
# library() is used rather than require() so that a missing package stops the
# script immediately instead of returning FALSE and failing later.
library(quanteda)
#> Package version: 3.2.1
#> Unicode version: 13.0
#> ICU version: 69.1
#> Parallel computing: 4 of 4 threads used.
#> See https://quanteda.io for tutorials and examples.
library(quanteda.textstats)

# Two small named corpora standing in for the news articles and the decisions.
corp_news <- corpus(c(news1 = "politics party vote",
                      news2 = "crime police family"))
corp_pol <- corpus(c(pol1 = "member party vote",
                     pol2 = "family income",
                     pol3 = "crime prison"))

# Tokenise each corpus and build its document-feature matrix. Written as a
# nested call so the example does not rely on a pipe operator being available.
dfmt_news <- dfm(tokens(corp_news))
dfmt_pol <- dfm(tokens(corp_pol))

dfmt_news
#> Document-feature matrix of: 2 documents, 6 features (50.00% sparse) and 0 docvars.
#>        features
#> docs    politics party vote crime police family
#>   news1        1     1    1     0      0      0
#>   news2        0     0    0     1      1      1
dfmt_pol
#> Document-feature matrix of: 3 documents, 7 features (66.67% sparse) and 0 docvars.
#>       features
#> docs   member party vote family income crime prison
#>   pol1      1     1    1      0      0     0      0
#>   pol2      0     0    0      1      1     0      0
#>   pol3      0     0    0      0      0     1      1

# Rows are the documents of `x` (news), columns the documents of `y`
# (decisions); no within-set pairs appear in the result.
textstat_simil(x = dfmt_news, y = dfmt_pol, method = "cosine")
#> textstat_simil object; method = "cosine"
#>        pol1  pol2  pol3
#> news1 0.667     0     0
#> news2     0 0.408 0.408

（由 reprex 包 v2.0.1 创建于 2022-06-25）

Only use x and y in textstat_simil().

# Minimal reproducible example: give one dfm as `x` and the other as `y`, and
# textstat_simil() returns only the news-by-decision similarities.
# library() is used rather than require() so that a missing package stops the
# script immediately instead of returning FALSE and failing later.
library(quanteda)
#> Package version: 3.2.1
#> Unicode version: 13.0
#> ICU version: 69.1
#> Parallel computing: 4 of 4 threads used.
#> See https://quanteda.io for tutorials and examples.
library(quanteda.textstats)

# Two small named corpora standing in for the news articles and the decisions.
corp_news <- corpus(c(news1 = "politics party vote",
                      news2 = "crime police family"))
corp_pol <- corpus(c(pol1 = "member party vote",
                     pol2 = "family income",
                     pol3 = "crime prison"))

# Tokenise each corpus and build its document-feature matrix. Written as a
# nested call so the example does not rely on a pipe operator being available.
dfmt_news <- dfm(tokens(corp_news))
dfmt_pol <- dfm(tokens(corp_pol))

dfmt_news
#> Document-feature matrix of: 2 documents, 6 features (50.00% sparse) and 0 docvars.
#>        features
#> docs    politics party vote crime police family
#>   news1        1     1    1     0      0      0
#>   news2        0     0    0     1      1      1
dfmt_pol
#> Document-feature matrix of: 3 documents, 7 features (66.67% sparse) and 0 docvars.
#>       features
#> docs   member party vote family income crime prison
#>   pol1      1     1    1      0      0     0      0
#>   pol2      0     0    0      1      1     0      0
#>   pol3      0     0    0      0      0     1      1

# Rows are the documents of `x` (news), columns the documents of `y`
# (decisions); no within-set pairs appear in the result.
textstat_simil(x = dfmt_news, y = dfmt_pol, method = "cosine")
#> textstat_simil object; method = "cosine"
#>        pol1  pol2  pol3
#> news1 0.667     0     0
#> news2     0 0.408 0.408

<sup>Created on 2022-06-25 by the reprex package (v2.0.1)</sup>

回复收藏 0 原文

~没有更多了~