为什么 Apache Mahout 频繁模式挖掘(mining)算法仅返回 1 项项集?

发布于 2024-12-17 09:44:34 字数 6822 浏览 2 评论 0原文

我目前正在测试 Apache Mahout Parallel 频繁模式挖掘 。在实际项目中使用它之前,我从一个简单的代码开始,只是为了确保它按照我的预期工作......

我没有找到包含代码、数据和输出的完整示例。

我当前有一个可以编译并运行的版本(请参见下面的 java / scala 代码),但返回的每个频繁模式都只包含一个项(请参见下面的示例输出)。

这是预期的行为吗? 我做错了什么?

感谢您的帮助...

scala 代码:

  import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth
  import java.util.HashSet
  import org.apache.mahout.common.iterator.StringRecordIterator
  import org.apache.mahout.common.iterator.FileLineIterable
  import org.apache.mahout.fpm.pfpgrowth.convertors._
  import org.apache.mahout.fpm.pfpgrowth.convertors.integer._
  import org.apache.mahout.fpm.pfpgrowth.convertors.string._
  import org.apache.hadoop.io.SequenceFile.Writer
  import org.apache.mahout.fpm.pfpgrowth.convertors.StatusUpdater
  import org.apache.hadoop.mapred.OutputCollector
  import scala.collection.JavaConversions._
  import java.util.{ List => JList }
  import org.apache.mahout.common.{ Pair => JPair }
  import java.lang.{ Long => JLong }
  import org.apache.hadoop.io.{ Text => JText }

  // Script-style demo: builds a handful of weighted transactions, computes the
  // frequency list with FPGrowth.generateFList, then asks FPGrowth for the
  // top-k frequent patterns and prints whatever the collector receives.

  // Minimum support; passed (as Int) to generateFList and (as Long) to
  // generateTopKFrequentPatterns below.
  val minSupport = 5L
  // Passed as the `k` argument of generateTopKFrequentPatterns.
  val k: Int = 50
  val fps: FPGrowth[String] = new FPGrowth[String]()

  // Item labels used to build the sample transactions.
  val milk = "milk"
  val bread = "bread"
  val butter = "butter"
  val bier = "bier"

  // Sample transactions as (item list, Long) pairs; the Long is presumably the
  // number of occurrences of that transaction -- TODO confirm against Mahout docs.
  // NOTE(review): this is a single scala Iterator instance, and the very same
  // instance is handed to BOTH generateFList and generateTopKFrequentPatterns
  // below. An Iterator can only be traversed once, so if the first call drains
  // it the second call sees no transactions.
  val transactionStream: Iterator[JPair[JList[String], JLong]] = Iterator(
    new JPair(List(milk, bread), 10L),
    new JPair(List(butter), 10L),
    new JPair(List(bier), 10L),
    new JPair(List(milk, bread, butter), 5L),
    new JPair(List(milk, bread, bier), 5L),
    new JPair(List(bread), 10L)
  )

  // First use of transactionStream: compute the per-item frequency list.
  val frequencies: Collection[JPair[String, JLong]] = fps.generateFList(
    transactionStream, minSupport.toInt)

  println("freqList :" + frequencies)

  // Not used: `null` is passed in its place in the call at the bottom.
  var returnableFeatures: Collection[String] = List(
    milk, bread, butter, bier)

  // Collector that prints each key followed by its patterns, formatted as
  // "[item,item] : support" and joined with "; ".
  var output: OutputCollector[String, JList[JPair[JList[String], JLong]]] = (
    new OutputCollector[String, JList[JPair[JList[String], JLong]]] {
      def collect(x1: String,
                  x2: JList[JPair[JList[String], JLong]]) = {
        println(x1 + ":" +
          x2.map(pair => "[" + pair.getFirst.mkString(",") + "] : " +
            pair.getSecond).mkString("; "))
      }
    }
  )

  // Status callback that simply echoes each status message to stdout.
  val updater: StatusUpdater = new StatusUpdater {
    def update(status: String) = println("updater : " + status)
  }

  // Second use of transactionStream (see NOTE(review) above).
  fps.generateTopKFrequentPatterns(
    transactionStream,
    frequencies,
    minSupport,
    k,
    null, //returnableFeatures
    output,
    updater)

java 代码:

import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;

import java.io.IOException;
import java.util.*;

import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;

import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;

import org.apache.hadoop.io.Text;

/**
 * Small stand-alone demo from the question: builds six one-count transactions,
 * computes the frequency list with {@code FPGrowth.generateFList}, then calls
 * {@code FPGrowth.generateTopKFrequentPatterns} and prints what is collected.
 */
class FPGrowthDemo {

  public static void main(String[] args) {
    // Minimum support; passed to both FPGrowth calls below.
    long minSupport = 1L;
    // Passed as the k argument of generateTopKFrequentPatterns.
    int k = 50;
    FPGrowth<String> fps = new FPGrowth<String>();

    // Item labels used to build the sample transactions.
    String milk = "milk";
    String bread = "bread";
    String butter = "butter";
    String bier = "bier";

    // Sample transactions as (item list, Long) pairs; the Long is presumably
    // the occurrence count of that transaction -- TODO confirm.
    LinkedList<Pair<List<String>, Long>> data = 
        new LinkedList<Pair<List<String>, Long>>();

    data.add(new Pair(Arrays.asList(milk, bread), 1L));
    data.add(new Pair(Arrays.asList(butter), 1L));
    data.add(new Pair(Arrays.asList(bier), 1L));
    data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
    data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
    data.add(new Pair(Arrays.asList(milk, bread), 1L));

    // NOTE(review): a single Iterator instance is created here and then passed
    // to BOTH generateFList and generateTopKFrequentPatterns below. An Iterator
    // is single-pass, so if the first call drains it the second call sees no
    // transactions.
    Iterator<Pair<List<String>, Long>> transactions = data.iterator();

    // First use of `transactions`: compute the per-item frequency list.
    Collection<Pair<String, Long>> frequencies = fps.generateFList(
        transactions, (int) minSupport);

    System.out.println("freqList :" + frequencies);

    // Not used: null is passed in its place in the mining call below.
    Collection<String> returnableFeatures = 
        Arrays.asList(milk, bread, butter, bier);

    // Collector that prints the key followed by every pattern as "[items]:support".
    OutputCollector<String, List<Pair<List<String>, Long>>> output = 
      new OutputCollector<String, List<Pair<List<String>, Long>>>() {
        @Override
        public void collect(String x1, 
                            List<Pair<List<String>, Long>> listPair)
            throws IOException {
          StringBuffer sb = new StringBuffer();
          sb.append(x1 + ":");
          for (Pair<List<String>, Long> pair : listPair) {
            sb.append("[");
            String sep = "";
            for (String item : pair.getFirst()) {
              // NOTE(review): the separator is appended AFTER the item, so a
              // two-item pattern is rendered as "milkbread, ".
              sb.append(item + sep);
              sep = ", ";
            }
            sb.append("]:" + pair.getSecond());
          }
          System.out.println("  " + sb.toString());
        }
    };


    // Status callback that echoes each status message to stdout.
    StatusUpdater updater = new StatusUpdater() { 
      public void update(String status){
        System.out.println("updater :" + status); 
      }
    };

    try {
    // Second use of `transactions` (see NOTE(review) above).
    fps.generateTopKFrequentPatterns(
        transactions,
        frequencies,
        minSupport, 
        k,
        null, //returnableFeatures
        output, 
        updater);
    }catch (Exception e){
      e.printStackTrace();
    }

  }
}

示例输出:

freqList :[(bread,4), (milk,4), (bier,2), (butter,2)]
17:48:19,108 INFO  ~ Number of unique items 4
17:48:19,109 INFO  ~ Number of unique pruned items 4
17:48:19,121 INFO  ~ Number of Nodes in the FP Tree: 0
17:48:19,122 INFO  ~ Mining FTree Tree for all patterns with 3
updater :FPGrowth Algorithm for a given feature: 3
  butter:[butter]:2
17:48:19,130 INFO  ~ Found 1 Patterns with Least Support 2
17:48:19,130 INFO  ~ Mining FTree Tree for all patterns with 2
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
  bier:[bier]:2
17:48:19,130 INFO  ~ Found 1 Patterns with Least Support 2
17:48:19,130 INFO  ~ Mining FTree Tree for all patterns with 1
updater :FPGrowth Algorithm for a given feature: 1
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
  milk:[milk]:4
17:48:19,131 INFO  ~ Found 1 Patterns with Least Support 4
17:48:19,131 INFO  ~ Mining FTree Tree for all patterns with 0
updater :FPGrowth Algorithm for a given feature: 0
updater :FPGrowth Algorithm for a given feature: 1
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
  bread:[bread]:4
17:48:19,131 INFO  ~ Found 1 Patterns with Least Support 4
17:48:19,131 INFO  ~ Tree Cache: First Level: Cache hits=6 Cache Misses=4

I'm currently testing Apache Mahout Parallel Frequent Pattern Mining . Before using it in the real project, I started with a simple code, just to be sure it works as I expect it to do...

I did not find a complete example with code, data, and output.

I currently have a version that compiles and runs (see the Java / Scala code below), but each returned frequent pattern contains only a single item (see the sample output below).

Is this the intended behavior?
What did I do wrong?

Thanks for your help...

scala code :

  import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth
  import java.util.HashSet
  import org.apache.mahout.common.iterator.StringRecordIterator
  import org.apache.mahout.common.iterator.FileLineIterable
  import org.apache.mahout.fpm.pfpgrowth.convertors._
  import org.apache.mahout.fpm.pfpgrowth.convertors.integer._
  import org.apache.mahout.fpm.pfpgrowth.convertors.string._
  import org.apache.hadoop.io.SequenceFile.Writer
  import org.apache.mahout.fpm.pfpgrowth.convertors.StatusUpdater
  import org.apache.hadoop.mapred.OutputCollector
  import scala.collection.JavaConversions._
  import java.util.{ List => JList }
  import org.apache.mahout.common.{ Pair => JPair }
  import java.lang.{ Long => JLong }
  import org.apache.hadoop.io.{ Text => JText }

  // Script-style demo: builds a handful of weighted transactions, computes the
  // frequency list with FPGrowth.generateFList, then asks FPGrowth for the
  // top-k frequent patterns and prints whatever the collector receives.

  // Minimum support; passed (as Int) to generateFList and (as Long) to
  // generateTopKFrequentPatterns below.
  val minSupport = 5L
  // Passed as the `k` argument of generateTopKFrequentPatterns.
  val k: Int = 50
  val fps: FPGrowth[String] = new FPGrowth[String]()

  // Item labels used to build the sample transactions.
  val milk = "milk"
  val bread = "bread"
  val butter = "butter"
  val bier = "bier"

  // Sample transactions as (item list, Long) pairs; the Long is presumably the
  // number of occurrences of that transaction -- TODO confirm against Mahout docs.
  // NOTE(review): this is a single scala Iterator instance, and the very same
  // instance is handed to BOTH generateFList and generateTopKFrequentPatterns
  // below. An Iterator can only be traversed once, so if the first call drains
  // it the second call sees no transactions.
  val transactionStream: Iterator[JPair[JList[String], JLong]] = Iterator(
    new JPair(List(milk, bread), 10L),
    new JPair(List(butter), 10L),
    new JPair(List(bier), 10L),
    new JPair(List(milk, bread, butter), 5L),
    new JPair(List(milk, bread, bier), 5L),
    new JPair(List(bread), 10L)
  )

  // First use of transactionStream: compute the per-item frequency list.
  val frequencies: Collection[JPair[String, JLong]] = fps.generateFList(
    transactionStream, minSupport.toInt)

  println("freqList :" + frequencies)

  // Not used: `null` is passed in its place in the call at the bottom.
  var returnableFeatures: Collection[String] = List(
    milk, bread, butter, bier)

  // Collector that prints each key followed by its patterns, formatted as
  // "[item,item] : support" and joined with "; ".
  var output: OutputCollector[String, JList[JPair[JList[String], JLong]]] = (
    new OutputCollector[String, JList[JPair[JList[String], JLong]]] {
      def collect(x1: String,
                  x2: JList[JPair[JList[String], JLong]]) = {
        println(x1 + ":" +
          x2.map(pair => "[" + pair.getFirst.mkString(",") + "] : " +
            pair.getSecond).mkString("; "))
      }
    }
  )

  // Status callback that simply echoes each status message to stdout.
  val updater: StatusUpdater = new StatusUpdater {
    def update(status: String) = println("updater : " + status)
  }

  // Second use of transactionStream (see NOTE(review) above).
  fps.generateTopKFrequentPatterns(
    transactionStream,
    frequencies,
    minSupport,
    k,
    null, //returnableFeatures
    output,
    updater)

java code :

import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;

import java.io.IOException;
import java.util.*;

import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;

import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;

import org.apache.hadoop.io.Text;

/**
 * Small stand-alone demo from the question: builds six one-count transactions,
 * computes the frequency list with {@code FPGrowth.generateFList}, then calls
 * {@code FPGrowth.generateTopKFrequentPatterns} and prints what is collected.
 */
class FPGrowthDemo {

  public static void main(String[] args) {
    // Minimum support; passed to both FPGrowth calls below.
    long minSupport = 1L;
    // Passed as the k argument of generateTopKFrequentPatterns.
    int k = 50;
    FPGrowth<String> fps = new FPGrowth<String>();

    // Item labels used to build the sample transactions.
    String milk = "milk";
    String bread = "bread";
    String butter = "butter";
    String bier = "bier";

    // Sample transactions as (item list, Long) pairs; the Long is presumably
    // the occurrence count of that transaction -- TODO confirm.
    LinkedList<Pair<List<String>, Long>> data = 
        new LinkedList<Pair<List<String>, Long>>();

    data.add(new Pair(Arrays.asList(milk, bread), 1L));
    data.add(new Pair(Arrays.asList(butter), 1L));
    data.add(new Pair(Arrays.asList(bier), 1L));
    data.add(new Pair(Arrays.asList(milk, bread, butter), 1L));
    data.add(new Pair(Arrays.asList(milk, bread, bier), 1L));
    data.add(new Pair(Arrays.asList(milk, bread), 1L));

    // NOTE(review): a single Iterator instance is created here and then passed
    // to BOTH generateFList and generateTopKFrequentPatterns below. An Iterator
    // is single-pass, so if the first call drains it the second call sees no
    // transactions.
    Iterator<Pair<List<String>, Long>> transactions = data.iterator();

    // First use of `transactions`: compute the per-item frequency list.
    Collection<Pair<String, Long>> frequencies = fps.generateFList(
        transactions, (int) minSupport);

    System.out.println("freqList :" + frequencies);

    // Not used: null is passed in its place in the mining call below.
    Collection<String> returnableFeatures = 
        Arrays.asList(milk, bread, butter, bier);

    // Collector that prints the key followed by every pattern as "[items]:support".
    OutputCollector<String, List<Pair<List<String>, Long>>> output = 
      new OutputCollector<String, List<Pair<List<String>, Long>>>() {
        @Override
        public void collect(String x1, 
                            List<Pair<List<String>, Long>> listPair)
            throws IOException {
          StringBuffer sb = new StringBuffer();
          sb.append(x1 + ":");
          for (Pair<List<String>, Long> pair : listPair) {
            sb.append("[");
            String sep = "";
            for (String item : pair.getFirst()) {
              // NOTE(review): the separator is appended AFTER the item, so a
              // two-item pattern is rendered as "milkbread, ".
              sb.append(item + sep);
              sep = ", ";
            }
            sb.append("]:" + pair.getSecond());
          }
          System.out.println("  " + sb.toString());
        }
    };


    // Status callback that echoes each status message to stdout.
    StatusUpdater updater = new StatusUpdater() { 
      public void update(String status){
        System.out.println("updater :" + status); 
      }
    };

    try {
    // Second use of `transactions` (see NOTE(review) above).
    fps.generateTopKFrequentPatterns(
        transactions,
        frequencies,
        minSupport, 
        k,
        null, //returnableFeatures
        output, 
        updater);
    }catch (Exception e){
      e.printStackTrace();
    }

  }
}

sample output :

freqList :[(bread,4), (milk,4), (bier,2), (butter,2)]
17:48:19,108 INFO  ~ Number of unique items 4
17:48:19,109 INFO  ~ Number of unique pruned items 4
17:48:19,121 INFO  ~ Number of Nodes in the FP Tree: 0
17:48:19,122 INFO  ~ Mining FTree Tree for all patterns with 3
updater :FPGrowth Algorithm for a given feature: 3
  butter:[butter]:2
17:48:19,130 INFO  ~ Found 1 Patterns with Least Support 2
17:48:19,130 INFO  ~ Mining FTree Tree for all patterns with 2
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
  bier:[bier]:2
17:48:19,130 INFO  ~ Found 1 Patterns with Least Support 2
17:48:19,130 INFO  ~ Mining FTree Tree for all patterns with 1
updater :FPGrowth Algorithm for a given feature: 1
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
  milk:[milk]:4
17:48:19,131 INFO  ~ Found 1 Patterns with Least Support 4
17:48:19,131 INFO  ~ Mining FTree Tree for all patterns with 0
updater :FPGrowth Algorithm for a given feature: 0
updater :FPGrowth Algorithm for a given feature: 1
updater :FPGrowth Algorithm for a given feature: 2
updater :FPGrowth Algorithm for a given feature: 3
  bread:[bread]:4
17:48:19,131 INFO  ~ Found 1 Patterns with Least Support 4
17:48:19,131 INFO  ~ Tree Cache: First Level: Cache hits=6 Cache Misses=4

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

霓裳挽歌倾城醉 2024-12-24 09:44:34

该代码有缺陷:首先调用事务上的迭代器来计算频率,然后由 fp-growth 算法再次调用。问题是第二次调用不会返回任何值,因为迭代器已到达末尾...

作为参考,这里是正确的 java 代码:

import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;

import java.io.IOException;
import java.util.*;

import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;

import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;

import org.apache.hadoop.io.Text;

/**
 * Minimal in-memory demo of Mahout's {@code FPGrowth}: builds six sample
 * transactions, computes the item frequency list, then mines the top-k
 * frequent patterns and prints them to stdout.
 *
 * <p>The transactions are traversed twice (once per FPGrowth call), so each
 * call is given its own iterator obtained from the backing list.
 */
class FPGrowthDemo {

  public static void main(String[] args) {
    // Minimum support; passed to both FPGrowth calls below.
    long minSupport = 1L;
    // Passed as the k argument of generateTopKFrequentPatterns.
    int k = 50;
    FPGrowth<String> fps = new FPGrowth<String>();

    String milk = "milk";
    String bread = "bread";
    String butter = "butter";
    String bier = "bier";

    // Sample transactions as (item list, Long) pairs. Pair is parameterized
    // explicitly so the list is built without raw-type/unchecked warnings.
    LinkedList<Pair<List<String>, Long>> data =
        new LinkedList<Pair<List<String>, Long>>();

    data.add(new Pair<List<String>, Long>(Arrays.asList(milk, bread), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(butter), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(bier), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(milk, bread, butter), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(milk, bread, bier), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(milk, bread), 1L));

    // Do NOT store data.iterator() in a variable and reuse it: an Iterator is
    // single-pass, so a reused one would already be at its end when handed to
    // the mining call further down.
    Collection<Pair<String, Long>> frequencies = fps.generateFList(
        data.iterator(), // first pass over the transactions
        (int) minSupport);

    System.out.println("freqList :" + frequencies);

    // Collector that prints the key followed by every pattern as
    // "[item, item]:support".
    OutputCollector<String, List<Pair<List<String>, Long>>> output =
        new OutputCollector<String, List<Pair<List<String>, Long>>>() {

      @Override
      public void collect(String x1,
                          List<Pair<List<String>, Long>> listPair)
          throws IOException {
        // Local, single-threaded buffer: StringBuilder is sufficient.
        StringBuilder sb = new StringBuilder();
        sb.append(x1).append(":");
        for (Pair<List<String>, Long> pair : listPair) {
          sb.append("[");
          String sep = "";
          for (String item : pair.getFirst()) {
            // Separator goes BEFORE every item except the first, so a
            // two-item pattern prints as "milk, bread" (not "milkbread, ").
            sb.append(sep).append(item);
            sep = ", ";
          }
          sb.append("]:").append(pair.getSecond());
        }
        System.out.println("  " + sb.toString());
      }
    };

    // Status callback that echoes each status message to stdout.
    StatusUpdater updater = new StatusUpdater() {
      public void update(String status) {
        System.out.println("updater :" + status);
      }
    };

    try {
      fps.generateTopKFrequentPatterns(
        data.iterator(), // second pass: a fresh iterator over the same data
        frequencies,
        minSupport,
        k,
        null, // returnableFeatures argument left as null
        output,
        updater);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}

The code is buggy: the iterator over the transactions is consumed first to compute the frequencies, and the same iterator is then passed again to the FP-growth algorithm. The problem is that this second pass yields no values, because the iterator has already reached its end...

For reference, here is the correct java code :

import org.apache.mahout.fpm.pfpgrowth.fpgrowth.*;

import java.io.IOException;
import java.util.*;

import org.apache.mahout.common.iterator.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.integer.*;
import org.apache.mahout.fpm.pfpgrowth.convertors.string.*;

import org.apache.hadoop.mapred.OutputCollector;
import org.apache.mahout.common.*;

import org.apache.hadoop.io.Text;

/**
 * Minimal in-memory demo of Mahout's {@code FPGrowth}: builds six sample
 * transactions, computes the item frequency list, then mines the top-k
 * frequent patterns and prints them to stdout.
 *
 * <p>The transactions are traversed twice (once per FPGrowth call), so each
 * call is given its own iterator obtained from the backing list.
 */
class FPGrowthDemo {

  public static void main(String[] args) {
    // Minimum support; passed to both FPGrowth calls below.
    long minSupport = 1L;
    // Passed as the k argument of generateTopKFrequentPatterns.
    int k = 50;
    FPGrowth<String> fps = new FPGrowth<String>();

    String milk = "milk";
    String bread = "bread";
    String butter = "butter";
    String bier = "bier";

    // Sample transactions as (item list, Long) pairs. Pair is parameterized
    // explicitly so the list is built without raw-type/unchecked warnings.
    LinkedList<Pair<List<String>, Long>> data =
        new LinkedList<Pair<List<String>, Long>>();

    data.add(new Pair<List<String>, Long>(Arrays.asList(milk, bread), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(butter), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(bier), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(milk, bread, butter), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(milk, bread, bier), 1L));
    data.add(new Pair<List<String>, Long>(Arrays.asList(milk, bread), 1L));

    // Do NOT store data.iterator() in a variable and reuse it: an Iterator is
    // single-pass, so a reused one would already be at its end when handed to
    // the mining call further down.
    Collection<Pair<String, Long>> frequencies = fps.generateFList(
        data.iterator(), // first pass over the transactions
        (int) minSupport);

    System.out.println("freqList :" + frequencies);

    // Collector that prints the key followed by every pattern as
    // "[item, item]:support".
    OutputCollector<String, List<Pair<List<String>, Long>>> output =
        new OutputCollector<String, List<Pair<List<String>, Long>>>() {

      @Override
      public void collect(String x1,
                          List<Pair<List<String>, Long>> listPair)
          throws IOException {
        // Local, single-threaded buffer: StringBuilder is sufficient.
        StringBuilder sb = new StringBuilder();
        sb.append(x1).append(":");
        for (Pair<List<String>, Long> pair : listPair) {
          sb.append("[");
          String sep = "";
          for (String item : pair.getFirst()) {
            // Separator goes BEFORE every item except the first, so a
            // two-item pattern prints as "milk, bread" (not "milkbread, ").
            sb.append(sep).append(item);
            sep = ", ";
          }
          sb.append("]:").append(pair.getSecond());
        }
        System.out.println("  " + sb.toString());
      }
    };

    // Status callback that echoes each status message to stdout.
    StatusUpdater updater = new StatusUpdater() {
      public void update(String status) {
        System.out.println("updater :" + status);
      }
    };

    try {
      fps.generateTopKFrequentPatterns(
        data.iterator(), // second pass: a fresh iterator over the same data
        frequencies,
        minSupport,
        k,
        null, // returnableFeatures argument left as null
        output,
        updater);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文