Getting zero tf_idf from a dfm with quanteda in R


I want to create a document-feature matrix with tf-idf as weights. If I calculate the tf-idf as in https://quanteda.io/reference/dfm_tfidf.html, I get only zeros. The same happens if I try to get the tf-idf with tidytext from the same tokens object. It looks to me as if the information about the number of documents in the corpus cannot be used. If I use tidytext from scratch, it works.

# Build a corpus, tokenize, drop English stopwords, and keep unigrams
harry_token <- harry_data %>%
  corpus() %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE) %>%
  tokens_remove(stopwords("english")) %>%
  tokens_ngrams(n = 1)


# Lemmatize: replace each token with its lemma from the lemma_en lookup table
harry_token <- tokens_replace(tokens(harry_token), pattern = lemma_en$token, replacement = lemma_en$lemma)
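(lemma_en is not defined in the post; one common source for such a token/lemma lookup is lexicon::hash_lemmas, which has exactly these two columns. This is an assumption for illustration, not something stated in the question.)

# Hypothetical definition of lemma_en (not shown in the original post):
# lexicon::hash_lemmas is a lookup table with columns `token` and `lemma`.
lemma_en <- lexicon::hash_lemmas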



harry_token[1]
[1] "Boy"       "live"      "Mr"        "Mrs"       "Dursley"   "number"    "four"      "Privet"    "drive"    
[10] "proud"     "say"       "perfectly"
[ ... and 40,770 more ]



harry_token %>%
  dfm(verbose = FALSE)

Document-feature matrix of: 7 documents, 13,528 features (52.85% sparse) and 1 docvar.
       features
docs    boy live  mr mrs dursley number four privet drive proud
  text1  99   25  81  46     104     19   32     16    31     7
  text2  60   23 135  96      39      8   25      7    15     2
  text3  61   23  60  24      29     18   14      9    21     6
  text4 121   34 521 155      41     35   54     16    26     6
  text5 102   41 240 276      43     69   65     24    42    11
  text6 105   38 102 154      23     34   34      7    15    10
[ reached max_ndoc ... 1 more document, reached max_nfeat ... 13,518 more features ]

harry_token %>%
  dfm(verbose = FALSE) %>%
  dfm_tfidf()
Document-feature matrix of: 7 documents, 13,528 features (52.85% sparse) and 1 docvar.
       features
docs    boy live mr mrs dursley number four privet drive proud
  text1   0    0  0   0       0      0    0      0     0     0
  text2   0    0  0   0       0      0    0      0     0     0
  text3   0    0  0   0       0      0    0      0     0     0
  text4   0    0  0   0       0      0    0      0     0     0
  text5   0    0  0   0       0      0    0      0     0     0
  text6   0    0  0   0       0      0    0      0     0     0
[ reached max_ndoc ... 1 more document, reached max_nfeat ... 13,518 more features ]
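One way to check whether the whole matrix is really zero (a minimal diagnostic sketch, assuming the harry_token object built above): dfm_tfidf() defaults to idf = log10(N / docfreq), so a feature that occurs in all 7 documents gets idf = log10(7/7) = 0, and the ten features printed here are exactly such common terms.

harry_dfm <- dfm(harry_token, verbose = FALSE)

# Document frequencies of the features printed above: a term that occurs in
# all 7 books gets idf = log10(7 / 7) = 0 under dfm_tfidf()'s defaults.
docfreq(harry_dfm)[c("boy", "live", "mr", "mrs", "dursley")]

# The weighted matrix is not entirely zero: column sums of the tf-idf
# weights are positive for features that do not occur in every book.
head(sort(colSums(dfm_tfidf(harry_dfm)), decreasing = TRUE))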




harry_dfm <- harry_token %>%
  dfm(verbose = FALSE)

tidy(harry_dfm) %>% 
    bind_tf_idf(term = term, document = document, n = count)

# A tibble: 44,646 x 6
   document term    count       tf   idf tf_idf
   <chr>    <chr>   <dbl>    <dbl> <dbl>  <dbl>
 1 text1    boy        99 0.00243      0      0
 2 text1    live       25 0.000613     0      0
 3 text1    mr         81 0.00199      0      0
 4 text1    mrs        46 0.00113      0      0
 5 text1    dursley   104 0.00255      0      0
 6 text1    number     19 0.000466     0      0
 7 text1    four       32 0.000785     0      0
 8 text1    privet     16 0.000392     0      0
 9 text1    drive      31 0.000760     0      0
10 text1    proud       7 0.000172     0      0
# ... with 44,636 more rows
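The tidy() route reflects the same weighting: bind_tf_idf() also gives idf = 0 to terms that occur in every document, and the first ten rows happen to be such terms. Sorting by tf_idf (a small sketch reusing harry_dfm from above) brings any non-zero rows to the top:

# Reusing harry_dfm from above: the non-zero tf_idf values sit further down
# the table, on terms that appear in only some of the seven documents.
tidy(harry_dfm) %>%
  bind_tf_idf(term = term, document = document, n = count) %>%
  arrange(desc(tf_idf))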

If I calculate the tf-idf from scratch with tidytext, it works.

harry_data %>%
  unnest_tokens(word, text) %>%
  group_by(title) %>%
  count(word) %>%
  bind_tf_idf(word, title, n)
# A tibble: 67,881 x 6
# Groups:   title [7]
   title              word      n        tf   idf     tf_idf
   <chr>              <chr> <int>     <dbl> <dbl>      <dbl>
 1 Chamber of Secrets 0         1 0.0000117 0.847 0.00000992
 2 Chamber of Secrets 1         6 0.0000703 0.154 0.0000108 
 3 Chamber of Secrets 1,5       1 0.0000117 1.95  0.0000228 
 4 Chamber of Secrets 1,520     1 0.0000117 1.95  0.0000228 
 5 Chamber of Secrets 100       1 0.0000117 1.95  0.0000228 
 6 Chamber of Secrets 101       1 0.0000117 1.95  0.0000228 
 7 Chamber of Secrets 102       1 0.0000117 1.95  0.0000228 
 8 Chamber of Secrets 104       1 0.0000117 1.95  0.0000228 
 9 Chamber of Secrets 105       1 0.0000117 1.95  0.0000228 
10 Chamber of Secrets 106       1 0.0000117 1.95  0.0000228 
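For comparison, the from-scratch tidytext result uses the same weighting; its first rows just happen to be rare number tokens that do not occur in every book. A quick check (a sketch assuming the same harry_data) on a few common words:

# Words that occur in every one of the seven books get idf = 0 here too;
# only the displayed rows differ because of sorting and tokenization.
harry_data %>%
  unnest_tokens(word, text) %>%
  group_by(title) %>%
  count(word) %>%
  bind_tf_idf(word, title, n) %>%
  filter(word %in% c("boy", "dursley", "mr"))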

