JNI 分段错误 bug
有一个名为 ocamorph 的形态分析器(开源,用 OCml 编写)。 在此处下载并提供说明
java 绑定有问题,我必须修复它,之后经过几个小时的努力,现在在我看来需要几天时间才能修复它,因为我不熟悉 C、JNI、OCml 和这个特定的软件。
在这里您可以看到,对于小文件 (subtitles_136.hu.tok) 它可以工作,但对于较大的文件 (Tolkien_1.hu.tok) 会抛出“分段错误”:
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache2.txt > src/java/mokk/nlp/ocamorph/subtitles_136.hu.stem < src/java/mokk/nlp/ocamorph/subtitles_136.hu.tok
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache.txt > src/java/mokk/nlp/ocamorph/Tolkien_1.en.stem < src/java/mokk/nlp/ocamorph/Tolkien_1.en.tok
Segmentation fault
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ ls -l src/java/mokk/nlp/ocamorph/
total 2116
-rw-rw-r-- 1 bpgergo breka 8505 2009-09-22 13:53 cache2.txt
-rw-rw-r-- 1 bpgergo breka 65 2009-07-07 18:48 Compounds.java
drwxrwxr-x 2 bpgergo breka 4096 2009-09-22 13:54 CVS
-rw-rw-r-- 1 bpgergo breka 5888 2009-09-18 17:19 FileStemmer.java
-rw-rw-r-- 1 bpgergo breka 77 2009-07-07 18:48 Guess.java
-rw-rw-r-- 1 bpgergo breka 953 2009-08-31 18:58 IOcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka 5419 2009-08-31 18:58 OcamorphCachedStemmer.java
-rw-rw-r-- 1 bpgergo breka 2836 2009-08-03 16:00 OcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka 4612 2009-09-22 12:51 OcamorphWrapper.java
-rw-rw-r-- 1 bpgergo breka 6731 2009-09-22 13:53 subtitles_136.hu.stem
-rw-rw-r-- 1 bpgergo breka 7356 2009-09-20 21:12 subtitles_136.hu.tok
-rw-rw-r-- 1 bpgergo breka 2907 2009-09-18 17:22 Tester.java
-rw-rw-r-- 1 bpgergo breka 0 2009-09-22 13:53 Tolkien_1.en.stem
-rw-rw-r-- 1 bpgergo breka 1033059 2009-09-17 16:09 Tolkien_1.en.tok
-rw-rw-r-- 1 bpgergo breka 0 2009-09-22 13:14 Tolkien_1.hu.stem
-rw-rw-r-- 1 bpgergo breka 1041968 2009-09-17 16:09 Tolkien_1.hu.tok
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $
这是 Java 绑定的 C 部分 (/ocamorph/ src/bindings/java/src/c/hunmorph_jnistub.c)。这可能是有问题的部分,感谢您提供任何提示或帮助查找错误:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mokk_nlp_ocamorph_OcamorphWrapper.h"
#include "ocamorph.h"
#define MAX_ANALYSIS 100
#define ANALYSIS_MAXLEN 100
// initialize the analysis string
char analysis[ANALYSIS_MAXLEN];
// initialize input buffer
char buffer[500];
char* analyses[MAX_ANALYSIS];
jmethodID MID_InstanceMethodCall_callback;
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_initIDs
(JNIEnv *env, jclass cls) {
MID_InstanceMethodCall_callback =
(*env)->GetMethodID(env, cls, "callback", "([B)V");
}
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_init
(JNIEnv * env, jobject obj, jstring bin_arg) {
/* Convert to UTF8 */
const char *bin_file = (*env)->GetStringUTFChars(env, bin_arg, JNI_FALSE);
ocamorph_startup();
ocamorph_engine engine = init_from_bin(bin_file,0/*Don't pass the stupid no_caps argument*/);
/* Release created UTF8 string */
(*env)->ReleaseStringUTFChars(env, bin_arg, bin_file);
int i;
for (i=0; i<MAX_ANALYSIS;i++) {
analyses[i] = (char *) malloc(ANALYSIS_MAXLEN * sizeof(char));
};
return (jlong) engine;
}
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_make_1analyzer
(JNIEnv *env, jobject obj, jlong engine , jint blocking, jint compunds, jint stop_at_first, jint guess) {
ocamorph_engine analyzer = make_analyzer((ocamorph_engine) engine, blocking, compunds, stop_at_first, guess);
return (jlong) analyzer;
}
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_analyze
(JNIEnv * env, jobject obj, jlong analyzer, jbyteArray word) {
ocamorph_engine analyzerc = (ocamorph_engine) analyzer;
/* Convert to UTF8 */
// const char *wordc = (*env)->GetStringUTFChars(env, word, JNI_FALSE);
//char *wordc = (char *) (*env)->GetByteArrayElements( env, word, 0);
const int maxInputLength = 1000;
char wordc[maxInputLength];
jsize len = (*env)->GetArrayLength(env,word);
if (len>=maxInputLength) { len = maxInputLength-1; }
if (len!=0)
{
(*env)->GetByteArrayRegion(env,word,0,len,(jbyte*)wordc);
}
wordc[len] = '\0';
int n = analyze(analyzerc,wordc,analyses,MAX_ANALYSIS, ANALYSIS_MAXLEN);
int i;
for (i=0; i < n; ++i) {
// jstring ana = (*env)->NewStringUTF(env, analyses[i]);
char* ana = analyses[i];
jbyteArray jb=(*env)->NewByteArray(env, strlen(ana));
(*env)->SetByteArrayRegion(env, jb, 0, strlen(ana), (jbyte *)ana);
(*env)->CallVoidMethod(env, obj, MID_InstanceMethodCall_callback, jb);
}
// (*env)->ReleaseStringUTFChars(env, word, wordc);
}
这是 Java 部分(/ocamorph/src/bindings/java/src/java/mokk/nlp/ocamorph/OcamorphWrapper.java):
package mokk.nlp.ocamorph;
import java.io.UnsupportedEncodingException;
import java.util.LinkedList;
import java.util.List;
/**
* JNI interface for Ocamorph. Constructor loads ocamorph engine and a specified binary resource.
*
* @author bpgergo
*
*/
public class OcamorphWrapper {
private long analyzerId;
private long engineId;
private native static void initIDs();
private native long init(String bin);
// const ocamorph_engine engine, const int blocking, const int compounds,
// const int stop_at_first, const int guess
// valami hiba van az ocamorph-ban, mert a stop_at_first vezerli az
// osszetettszosagot
private native long make_analyzer(long engine, int blocking, int compounds,
int stop_at_first, int guess);
private native void analyze(long analyzer, byte[] word);
static {
//TODO FIXME how to define the library dynamically?
System.loadLibrary("ocamorph");
initIDs();
}
/**
* the encoding required by the ocamorph lib
*/
private static String encoding = "ISO-8859-2";
//private static boolean debug = false;
/**
* analyze result (the callback will add the result strings)
*/
private List<String> analyzeResult = null;
/**
* Loads a new Ocamorph engine, using the given binary resource and the arguments.
*
* @param bin
* @param blocking
* @param stopAtFirst
* @param compounds
* @param guess
*/
public OcamorphWrapper(String bin, boolean blocking, boolean stopAtFirst,
Compounds compounds, Guess guess) {
super();
engineId = init(bin);
int comp = compounds2Code(compounds);
int gu = guessToCode(guess);
analyzerId = make_analyzer(engineId, boolean2Code(blocking), boolean2Code(stopAtFirst),
comp, gu);
//debug("engineId:"+engineId);
//debug("analyzerId:"+analyzerId);
//debug = false;
}
/**
* This is the interface method for ocamorph analysis for the java side.
* @param ba
*/
public List<String> analyze(String word) {
//debug("analyze:");
analyzeResult = new LinkedList<String>();
byte[] ba = null;
try {
ba = word.getBytes(encoding);
} catch (UnsupportedEncodingException e1) {
System.err
.println("Ocamorph analyze UnsupportedEncodingException: ");
e1.printStackTrace();
}
if (ba != null){
//debug //printBytes(ba, "analizze:");
analyze(analyzerId, ba);
}
return analyzeResult;
}
/**
* The C interface will call this method to return analysis results
*/
private void callback(byte[] ana) {
String s = null;
try {
// bpgergo 20090618 this was a bug
// s = new String(ana);
s = new String(ana, encoding);
} catch (UnsupportedEncodingException e) {
System.err.println("callback new String(ana, encoding) UnsupportedEncodingException:");
e.printStackTrace();
}
analyzeResult.add(s);
//if (s != null) {
//debug("!callback recieved: ");
// debug //printBytes(ana, s);
//} else {
//debug("callback s == null");
//}
}
/* static argument conversion methods */
private static int boolean2Code(boolean bool){
if (bool){
return 1;
} else {
return 0;
}
}
private static int compounds2Code(Compounds compounds){
int comp = 0;
switch (compounds) {
case No:
comp = 0;
break;
case Allow:
comp = 1;
break;
}
return comp;
}
private static int guessToCode(Guess guess){
int gu = 0;
switch (guess) {
case NoGuess:
gu = 0;
break;
case Fallback:
gu = 1;
break;
case Global:
gu = 2;
break;
}
return gu;
}
public String getEncoding() {
return encoding;
}
public long getAnalyzerId() {
return analyzerId;
}
/*private static void debug(String string) {
if (debug) {
System.out.println(string);
}
}*/
/* getter/setter methods */
public boolean isDebug() {
return false; //debug;
}
public void setDebug(boolean debug) {
//OcamorphWrapper.debug = debug;
}
/* static debug methods */
/*public static void printBytes(byte[] array, String name) {
if (debug) {
for (int k = 0; k < array.length; k++) {
debug(name + "[" + k + "] = " + "0x" + byteToHex(array[k]));
}
}
}*/
/*static public String byteToHex(byte b) {
// Returns hex String representation of byte b
char hexDigit[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'a', 'b', 'c', 'd', 'e', 'f' };
char[] array = { hexDigit[(b >> 4) & 0x0f], hexDigit[b & 0x0f] };
return new String(array);
}*/
/*static public String charToHex(char c) {
// Returns hex String representation of char c
byte hi = (byte) (c >>> 8);
byte lo = (byte) (c & 0xff);
return byteToHex(hi) + byteToHex(lo);
}*/
}
there is this morphological analyzer (open source, written in OCml) named ocamorph. download and make instructions here
The java binding is buggy and I'll have to fix it and after a few hours of struggle now it seems to me it'll take a few days to fix it as I'm not familiar with C, JNI, OCml and this particular software.
Here you can see that for a small file (subtitles_136.hu.tok) it works but for a larger file (Tolkien_1.hu.tok) "Segmentation error" is thrown:
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache2.txt > src/java/mokk/nlp/ocamorph/subtitles_136.hu.stem < src/java/mokk/nlp/ocamorph/subtitles_136.hu.tok
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ java -Djava.library.path=./output/ -cp output mokk.nlp.ocamorph.FileStemmer $HULEXICON src/java/mokk/nlp/ocamorph/cache.txt > src/java/mokk/nlp/ocamorph/Tolkien_1.en.stem < src/java/mokk/nlp/ocamorph/Tolkien_1.en.tok
Segmentation fault
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $ ls -l src/java/mokk/nlp/ocamorph/
total 2116
-rw-rw-r-- 1 bpgergo breka 8505 2009-09-22 13:53 cache2.txt
-rw-rw-r-- 1 bpgergo breka 65 2009-07-07 18:48 Compounds.java
drwxrwxr-x 2 bpgergo breka 4096 2009-09-22 13:54 CVS
-rw-rw-r-- 1 bpgergo breka 5888 2009-09-18 17:19 FileStemmer.java
-rw-rw-r-- 1 bpgergo breka 77 2009-07-07 18:48 Guess.java
-rw-rw-r-- 1 bpgergo breka 953 2009-08-31 18:58 IOcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka 5419 2009-08-31 18:58 OcamorphCachedStemmer.java
-rw-rw-r-- 1 bpgergo breka 2836 2009-08-03 16:00 OcamorphStemmer.java
-rw-rw-r-- 1 bpgergo breka 4612 2009-09-22 12:51 OcamorphWrapper.java
-rw-rw-r-- 1 bpgergo breka 6731 2009-09-22 13:53 subtitles_136.hu.stem
-rw-rw-r-- 1 bpgergo breka 7356 2009-09-20 21:12 subtitles_136.hu.tok
-rw-rw-r-- 1 bpgergo breka 2907 2009-09-18 17:22 Tester.java
-rw-rw-r-- 1 bpgergo breka 0 2009-09-22 13:53 Tolkien_1.en.stem
-rw-rw-r-- 1 bpgergo breka 1033059 2009-09-17 16:09 Tolkien_1.en.tok
-rw-rw-r-- 1 bpgergo breka 0 2009-09-22 13:14 Tolkien_1.hu.stem
-rw-rw-r-- 1 bpgergo breka 1041968 2009-09-17 16:09 Tolkien_1.hu.tok
bpgergo@krusovice:~/hunglish_tools/ocamorph/ocamorph/src/bindings/java $
This is the C part of the Java binding (/ocamorph/src/bindings/java/src/c/hunmorph_jnistub.c). This might be the buggy part, thanks for any hint or help for finding the bug:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mokk_nlp_ocamorph_OcamorphWrapper.h"
#include "ocamorph.h"
#define MAX_ANALYSIS 100
#define ANALYSIS_MAXLEN 100
// initialize the analysis string
char analysis[ANALYSIS_MAXLEN];
// initialize input buffer
char buffer[500];
char* analyses[MAX_ANALYSIS];
jmethodID MID_InstanceMethodCall_callback;
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_initIDs
(JNIEnv *env, jclass cls) {
MID_InstanceMethodCall_callback =
(*env)->GetMethodID(env, cls, "callback", "([B)V");
}
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_init
(JNIEnv * env, jobject obj, jstring bin_arg) {
/* Convert to UTF8 */
const char *bin_file = (*env)->GetStringUTFChars(env, bin_arg, JNI_FALSE);
ocamorph_startup();
ocamorph_engine engine = init_from_bin(bin_file,0/*Don't pass the stupid no_caps argument*/);
/* Release created UTF8 string */
(*env)->ReleaseStringUTFChars(env, bin_arg, bin_file);
int i;
for (i=0; i<MAX_ANALYSIS;i++) {
analyses[i] = (char *) malloc(ANALYSIS_MAXLEN * sizeof(char));
};
return (jlong) engine;
}
JNIEXPORT jlong JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_make_1analyzer
(JNIEnv *env, jobject obj, jlong engine , jint blocking, jint compunds, jint stop_at_first, jint guess) {
ocamorph_engine analyzer = make_analyzer((ocamorph_engine) engine, blocking, compunds, stop_at_first, guess);
return (jlong) analyzer;
}
JNIEXPORT void JNICALL Java_mokk_nlp_ocamorph_OcamorphWrapper_analyze
(JNIEnv * env, jobject obj, jlong analyzer, jbyteArray word) {
ocamorph_engine analyzerc = (ocamorph_engine) analyzer;
/* Convert to UTF8 */
// const char *wordc = (*env)->GetStringUTFChars(env, word, JNI_FALSE);
//char *wordc = (char *) (*env)->GetByteArrayElements( env, word, 0);
const int maxInputLength = 1000;
char wordc[maxInputLength];
jsize len = (*env)->GetArrayLength(env,word);
if (len>=maxInputLength) { len = maxInputLength-1; }
if (len!=0)
{
(*env)->GetByteArrayRegion(env,word,0,len,(jbyte*)wordc);
}
wordc[len] = '\0';
int n = analyze(analyzerc,wordc,analyses,MAX_ANALYSIS, ANALYSIS_MAXLEN);
int i;
for (i=0; i < n; ++i) {
// jstring ana = (*env)->NewStringUTF(env, analyses[i]);
char* ana = analyses[i];
jbyteArray jb=(*env)->NewByteArray(env, strlen(ana));
(*env)->SetByteArrayRegion(env, jb, 0, strlen(ana), (jbyte *)ana);
(*env)->CallVoidMethod(env, obj, MID_InstanceMethodCall_callback, jb);
}
// (*env)->ReleaseStringUTFChars(env, word, wordc);
}
And here is the Java part (/ocamorph/src/bindings/java/src/java/mokk/nlp/ocamorph/OcamorphWrapper.java):
package mokk.nlp.ocamorph;
import java.io.UnsupportedEncodingException;
import java.util.LinkedList;
import java.util.List;
/**
* JNI interface for Ocamorph. Constructor loads ocamorph engine and a specified binary resource.
*
* @author bpgergo
*
*/
public class OcamorphWrapper {
private long analyzerId;
private long engineId;
private native static void initIDs();
private native long init(String bin);
// const ocamorph_engine engine, const int blocking, const int compounds,
// const int stop_at_first, const int guess
// valami hiba van az ocamorph-ban, mert a stop_at_first vezerli az
// osszetettszosagot
private native long make_analyzer(long engine, int blocking, int compounds,
int stop_at_first, int guess);
private native void analyze(long analyzer, byte[] word);
static {
//TODO FIXME how to define the library dynamically?
System.loadLibrary("ocamorph");
initIDs();
}
/**
* the encoding required by the ocamorph lib
*/
private static String encoding = "ISO-8859-2";
//private static boolean debug = false;
/**
* analyze result (the callback will add the result strings)
*/
private List<String> analyzeResult = null;
/**
* Loads a new Ocamorph engine, using the given binary resource and the arguments.
*
* @param bin
* @param blocking
* @param stopAtFirst
* @param compounds
* @param guess
*/
public OcamorphWrapper(String bin, boolean blocking, boolean stopAtFirst,
Compounds compounds, Guess guess) {
super();
engineId = init(bin);
int comp = compounds2Code(compounds);
int gu = guessToCode(guess);
analyzerId = make_analyzer(engineId, boolean2Code(blocking), boolean2Code(stopAtFirst),
comp, gu);
//debug("engineId:"+engineId);
//debug("analyzerId:"+analyzerId);
//debug = false;
}
/**
* This is the interface method for ocamorph analysis for the java side.
* @param ba
*/
public List<String> analyze(String word) {
//debug("analyze:");
analyzeResult = new LinkedList<String>();
byte[] ba = null;
try {
ba = word.getBytes(encoding);
} catch (UnsupportedEncodingException e1) {
System.err
.println("Ocamorph analyze UnsupportedEncodingException: ");
e1.printStackTrace();
}
if (ba != null){
//debug //printBytes(ba, "analizze:");
analyze(analyzerId, ba);
}
return analyzeResult;
}
/**
* The C interface will call this method to return analysis results
*/
private void callback(byte[] ana) {
String s = null;
try {
// bpgergo 20090618 this was a bug
// s = new String(ana);
s = new String(ana, encoding);
} catch (UnsupportedEncodingException e) {
System.err.println("callback new String(ana, encoding) UnsupportedEncodingException:");
e.printStackTrace();
}
analyzeResult.add(s);
//if (s != null) {
//debug("!callback recieved: ");
// debug //printBytes(ana, s);
//} else {
//debug("callback s == null");
//}
}
/* static argument conversion methods */
private static int boolean2Code(boolean bool){
if (bool){
return 1;
} else {
return 0;
}
}
private static int compounds2Code(Compounds compounds){
int comp = 0;
switch (compounds) {
case No:
comp = 0;
break;
case Allow:
comp = 1;
break;
}
return comp;
}
private static int guessToCode(Guess guess){
int gu = 0;
switch (guess) {
case NoGuess:
gu = 0;
break;
case Fallback:
gu = 1;
break;
case Global:
gu = 2;
break;
}
return gu;
}
public String getEncoding() {
return encoding;
}
public long getAnalyzerId() {
return analyzerId;
}
/*private static void debug(String string) {
if (debug) {
System.out.println(string);
}
}*/
/* getter/setter methods */
public boolean isDebug() {
return false; //debug;
}
public void setDebug(boolean debug) {
//OcamorphWrapper.debug = debug;
}
/* static debug methods */
/*public static void printBytes(byte[] array, String name) {
if (debug) {
for (int k = 0; k < array.length; k++) {
debug(name + "[" + k + "] = " + "0x" + byteToHex(array[k]));
}
}
}*/
/*static public String byteToHex(byte b) {
// Returns hex String representation of byte b
char hexDigit[] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'a', 'b', 'c', 'd', 'e', 'f' };
char[] array = { hexDigit[(b >> 4) & 0x0f], hexDigit[b & 0x0f] };
return new String(array);
}*/
/*static public String charToHex(char c) {
// Returns hex String representation of char c
byte hi = (byte) (c >>> 8);
byte lo = (byte) (c & 0xff);
return byteToHex(hi) + byteToHex(lo);
}*/
}
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(2)
系统崩溃时是否会创建任何类型的 hs_pid###.log 文件?他们偶尔可以帮助解决这些问题。
我的猜测是,这与设置 MID_InstanceMethodCall_callback 方法 id 的古怪方式有关。 id 存储为全局值,并且只有在调用 initIDs 静态方法时才会设置它,这在示例代码中似乎不会发生。如果未设置,则analyze在尝试调用回调方法时将会呕吐。确保获得回调方法 id 的方法如下:
Is any sort of hs_pid###.log file being created when the system crashes? They can occasionally help in figuring out these problems.
My guess is that it has something to do with the wacky way that the MID_InstanceMethodCall_callback method id is being set. The id is stored as a global value, and it only gets set if the initIDs static method is called, which doesn't appear to happen in your sample code. If it's not set, then analyse will barf when it tries to call the callback method. A way to ensure that you get the callback method id would be the following:
尝试使用调试信息构建 C 代码,并了解如何在您的(看似类似 Unix 的)操作系统上启用核心转储。这应该给你一个起点。
Try building the C code with debug information, and look up how to enable core dumps on your (seemingly Unix-like) operating system. That should give you a starting point.