返回介绍

lcci / 17.26.Sparse Similarity / README_EN

发布于 2024-06-17 01:04:42 字数 5418 浏览 0 评论 0 收藏 0

17.26. Sparse Similarity

中文文档

Description

The similarity of two documents (each with distinct words) is defined to be the size of the intersection divided by the size of the union. For example, if the documents consist of integers, the similarity of {1, 5, 3} and {1, 7, 2, 3} is 0.4, because the intersection has size 2 and the union has size 5. We have a long list of documents (with distinct values and each with an associated ID) where the similarity is believed to be "sparse". That is, any two arbitrarily selected documents are very likely to have similarity 0. Design an algorithm that returns a list of pairs of document IDs and the associated similarity.

Input is a 2D array docs, where docs[i] is the document with id i. Return an array of strings, where each string represents a pair of documents with similarity greater than 0. The string should be formatted as  {id1},{id2}: {similarity}, where id1 is the smaller id in the two documents, and similarity is the similarity rounded to four decimal places. You can return the array in any order.

Example:


Input:

[

  [14, 15, 100, 9, 3],

  [32, 1, 9, 3, 5],

  [15, 29, 2, 6, 8, 7],

  [7, 10]

]

Output:

[

  "0,1: 0.2500",

  "0,2: 0.1000",

  "2,3: 0.1429"

]

Note:

  • docs.length <= 500
  • docs[i].length <= 500
  • The number of document pairs with similarity greater than 0 will not exceed 1000.

Solutions

Solution 1

class Solution:
  def computeSimilarities(self, docs: List[List[int]]) -> List[str]:
    eps = 1e-9
    d = defaultdict(list)
    for i, v in enumerate(docs):
      for x in v:
        d[x].append(i)
    cnt = Counter()
    for ids in d.values():
      n = len(ids)
      for i in range(n):
        for j in range(i + 1, n):
          cnt[(ids[i], ids[j])] += 1
    ans = []
    for (i, j), v in cnt.items():
      tot = len(docs[i]) + len(docs[j]) - v
      x = v / tot + eps
      ans.append(f'{i},{j}: {x:.4f}')
    return ans
class Solution {
  public List<String> computeSimilarities(int[][] docs) {
    Map<Integer, List<Integer>> d = new HashMap<>();
    for (int i = 0; i < docs.length; ++i) {
      for (int v : docs[i]) {
        d.computeIfAbsent(v, k -> new ArrayList<>()).add(i);
      }
    }
    Map<String, Integer> cnt = new HashMap<>();
    for (var ids : d.values()) {
      int n = ids.size();
      for (int i = 0; i < n; ++i) {
        for (int j = i + 1; j < n; ++j) {
          String k = ids.get(i) + "," + ids.get(j);
          cnt.put(k, cnt.getOrDefault(k, 0) + 1);
        }
      }
    }
    List<String> ans = new ArrayList<>();
    for (var e : cnt.entrySet()) {
      String k = e.getKey();
      int v = e.getValue();
      String[] t = k.split(",");
      int i = Integer.parseInt(t[0]), j = Integer.parseInt(t[1]);
      int tot = docs[i].length + docs[j].length - v;
      double x = (double) v / tot;
      ans.add(String.format("%s: %.4f", k, x));
    }
    return ans;
  }
}
using pii = pair<int, int>;

class Solution {
public:
  vector<string> computeSimilarities(vector<vector<int>>& docs) {
    double eps = 1e-9;
    unordered_map<int, vector<int>> d;
    for (int i = 0; i < docs.size(); ++i) {
      for (int v : docs[i]) {
        d[v].push_back(i);
      }
    }
    map<pii, int> cnt;
    for (auto& [_, ids] : d) {
      int n = ids.size();
      for (int i = 0; i < n; ++i) {
        for (int j = i + 1; j < n; ++j) {
          cnt[{ids[i], ids[j]}]++;
        }
      }
    }
    vector<string> ans;
    for (auto& [k, v] : cnt) {
      auto [i, j] = k;
      int tot = docs[i].size() + docs[j].size() - v;
      double x = (double) v / tot + eps;
      char t[20];
      sprintf(t, "%d,%d: %0.4lf", i, j, x);
      ans.push_back(t);
    }
    return ans;
  }
};
func computeSimilarities(docs [][]int) []string {
  d := map[int][]int{}
  for i, v := range docs {
    for _, x := range v {
      d[x] = append(d[x], i)
    }
  }
  type pair struct{ i, j int }
  cnt := map[pair]int{}
  for _, ids := range d {
    n := len(ids)
    for i := 0; i < n; i++ {
      for j := i + 1; j < n; j++ {
        k := pair{ids[i], ids[j]}
        cnt[k]++
      }
    }
  }
  ans := []string{}
  for k, v := range cnt {
    i, j := k.i, k.j
    tot := len(docs[i]) + len(docs[j]) - v
    x := float64(v)/float64(tot) + 1e-9
    ans = append(ans, strconv.Itoa(i)+","+strconv.Itoa(j)+": "+fmt.Sprintf("%.4f", x))
  }
  return ans
}

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。
列表为空,暂无数据
    我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
    原文