矩阵加窗的 Java 代码优化计算时间更长
我有一个代表图像的矩阵,我需要循环每个像素,对于每个像素,我必须计算其所有邻居<的总和< /strong>,即属于以该像素为中心半径为rad
的窗口的像素。
我提出了三种替代方案:
- 最简单的方法,即为每个像素重新计算窗口的方法
- 更优化的方法,使用队列来存储窗口列的总和并循环遍历矩阵的列,通过添加新元素并删除旧元素
- 更优化的方法,不需要重新计算每一行的队列,而是增量调整以前保存的元素,
我使用队列在 c++ 中实现了它们> 对于第二种方法和 deques 的组合对于第三种方法(我需要迭代它们的元素而不破坏它们)并对它们的时间进行评分,看看是否有实际的改进。看来第三种方法确实更快。
然后我尝试将代码移植到 Java(我必须承认我对此不太满意)。我对第二种方法使用了 ArrayDeque,对第三种方法使用了 LinkedLists,导致第三种方法在时间上效率低下。
这是 C++ 中最简单的方法(我没有发布 java 版本,因为它几乎相同):
void normalWindowing(int mat[][MAX], int cols, int rows, int rad){
int i, j;
int h = 0;
for (i = 0; i < rows; ++i)
{
for (j = 0; j < cols; j++)
{
h = 0;
for (int ry =- rad; ry <= rad; ry++)
{
int y = i + ry;
if (y >= 0 && y < rows)
{
for (int rx =- rad; rx <= rad; rx++)
{
int x = j + rx;
if (x >= 0 && x < cols)
{
h += mat[y][x];
}
}
}
}
}
}
}
这是 C++ 中的第二种方法(通过列优化的方法):
void opt1Windowing(int mat[][MAX], int cols, int rows, int rad){
int i, j, h, y, col;
queue<int>* q = NULL;
for (i = 0; i < rows; ++i)
{
if (q != NULL)
delete(q);
q = new queue<int>();
h = 0;
for (int rx = 0; rx <= rad; rx++)
{
if (rx < cols)
{
int mem = 0;
for (int ry =- rad; ry <= rad; ry++)
{
y = i + ry;
if (y >= 0 && y < rows)
{
mem += mat[y][rx];
}
}
q->push(mem);
h += mem;
}
}
for (j = 1; j < cols; j++)
{
col = j + rad;
if (j - rad > 0)
{
h -= q->front();
q->pop();
}
if (j + rad < cols)
{
int mem = 0;
for (int ry =- rad; ry <= rad; ry++)
{
y = i + ry;
if (y >= 0 && y < rows)
{
mem += mat[y][col];
}
}
q->push(mem);
h += mem;
}
}
}
}
这是 Java 版本:
public static void opt1Windowing(int [][] mat, int rad){
int i, j = 0, h, y, col;
int cols = mat[0].length;
int rows = mat.length;
ArrayDeque<Integer> q = null;
for (i = 0; i < rows; ++i)
{
q = new ArrayDeque<Integer>();
h = 0;
for (int rx = 0; rx <= rad; rx++)
{
if (rx < cols)
{
int mem = 0;
for (int ry =- rad; ry <= rad; ry++)
{
y = i + ry;
if (y >= 0 && y < rows)
{
mem += mat[y][rx];
}
}
q.addLast(mem);
h += mem;
}
}
j = 0;
for (j = 1; j < cols; j++)
{
col = j + rad;
if (j - rad > 0)
{
h -= q.peekFirst();
q.pop();
}
if (j + rad < cols)
{
int mem = 0;
for (int ry =- rad; ry <= rad; ry++)
{
y = i + ry;
if (y >= 0 && y < rows)
{
mem += mat[y][col];
}
}
q.addLast(mem);
h += mem;
}
}
}
}
我认识到这篇文章将成为一面文字墙。这是 C++ 中的第三种方法:
void opt2Windowing(int mat[][MAX], int cols, int rows, int rad){
int i = 0;
int j = 0;
int h = 0;
int hh = 0;
deque< deque<int> *> * M = new deque< deque<int> *>();
for (int ry = 0; ry <= rad; ry++)
{
if (ry < rows)
{
deque<int> * q = new deque<int>();
M->push_back(q);
for (int rx = 0; rx <= rad; rx++)
{
if (rx < cols)
{
int val = mat[ry][rx];
q->push_back(val);
h += val;
}
}
}
}
deque<int> * C = new deque<int>(M->front()->size());
deque<int> * Q = new deque<int>(M->front()->size());
deque<int> * R = new deque<int>(M->size());
deque< deque<int> *>::iterator mit;
deque< deque<int> *>::iterator mstart = M->begin();
deque< deque<int> *>::iterator mend = M->end();
deque<int>::iterator rit;
deque<int>::iterator rstart = R->begin();
deque<int>::iterator rend = R->end();
deque<int>::iterator cit;
deque<int>::iterator cstart = C->begin();
deque<int>::iterator cend = C->end();
for (mit = mstart, rit = rstart; mit != mend, rit != rend; ++mit, ++rit)
{
deque<int>::iterator pit;
deque<int>::iterator pstart = (* mit)->begin();
deque<int>::iterator pend = (* mit)->end();
for(cit = cstart, pit = pstart; cit != cend && pit != pend; ++cit, ++pit)
{
(* cit) += (* pit);
(* rit) += (* pit);
}
}
for (i = 0; i < rows; ++i)
{
j = 0;
if (i - rad > 0)
{
deque<int>::iterator cit;
deque<int>::iterator cstart = C->begin();
deque<int>::iterator cend = C->end();
deque<int>::iterator pit;
deque<int>::iterator pstart = (M->front())->begin();
deque<int>::iterator pend = (M->front())->end();
for(cit = cstart, pit = pstart; cit != cend; ++cit, ++pit)
{
(* cit) -= (* pit);
}
deque<int> * k = M->front();
M->pop_front();
delete k;
h -= R->front();
R->pop_front();
}
int row = i + rad;
if (row < rows && i > 0)
{
deque<int> * newQ = new deque<int>();
M->push_back(newQ);
deque<int>::iterator cit;
deque<int>::iterator cstart = C->begin();
deque<int>::iterator cend = C->end();
int rx;
int tot = 0;
for (rx = 0, cit = cstart; rx <= rad; rx++, ++cit)
{
if (rx < cols)
{
int val = mat[row][rx];
newQ->push_back(val);
(* cit) += val;
tot += val;
}
}
R->push_back(tot);
h += tot;
}
hh = h;
copy(C->begin(), C->end(), Q->begin());
for (j = 1; j < cols; j++)
{
int col = j + rad;
if (j - rad > 0)
{
hh -= Q->front();
Q->pop_front();
}
if (j + rad < cols)
{
int val = 0;
for (int ry =- rad; ry <= rad; ry++)
{
int y = i + ry;
if (y >= 0 && y < rows)
{
val += mat[y][col];
}
}
hh += val;
Q->push_back(val);
}
}
}
}
最后是它的 Java 版本:
public static void opt2Windowing(int [][] mat, int rad){
int cols = mat[0].length;
int rows = mat.length;
int i = 0;
int j = 0;
int h = 0;
int hh = 0;
LinkedList<LinkedList<Integer>> M = new LinkedList<LinkedList<Integer>>();
for (int ry = 0; ry <= rad; ry++)
{
if (ry < rows)
{
LinkedList<Integer> q = new LinkedList<Integer>();
M.addLast(q);
for (int rx = 0; rx <= rad; rx++)
{
if (rx < cols)
{
int val = mat[ry][rx];
q.addLast(val);
h += val;
}
}
}
}
int firstSize = M.getFirst().size();
int mSize = M.size();
LinkedList<Integer> C = new LinkedList<Integer>();
LinkedList<Integer> Q = null;
LinkedList<Integer> R = new LinkedList<Integer>();
for (int k = 0; k < firstSize; k++)
{
C.add(0);
}
for (int k = 0; k < mSize; k++)
{
R.add(0);
}
ListIterator<LinkedList<Integer>> mit;
ListIterator<Integer> rit;
ListIterator<Integer> cit;
ListIterator<Integer> pit;
for (mit = M.listIterator(), rit = R.listIterator(); mit.hasNext();)
{
Integer r = rit.next();
int rsum = 0;
for (cit = C.listIterator(), pit = (mit.next()).listIterator();
cit.hasNext();)
{
Integer c = cit.next();
Integer p = pit.next();
rsum += p;
cit.set(c + p);
}
rit.set(r + rsum);
}
for (i = 0; i < rows; ++i)
{
j = 0;
if (i - rad > 0)
{
for(cit = C.listIterator(), pit = M.getFirst().listIterator();
cit.hasNext();)
{
Integer c = cit.next();
Integer p = pit.next();
cit.set(c - p);
}
M.removeFirst();
h -= R.getFirst();
R.removeFirst();
}
int row = i + rad;
if (row < rows && i > 0)
{
LinkedList<Integer> newQ = new LinkedList<Integer>();
M.addLast(newQ);
int rx;
int tot = 0;
for (rx = 0, cit = C.listIterator(); rx <= rad; rx++)
{
if (rx < cols)
{
Integer c = cit.next();
int val = mat[row][rx];
newQ.addLast(val);
cit.set(c + val);
tot += val;
}
}
R.addLast(tot);
h += tot;
}
hh = h;
Q = new LinkedList<Integer>();
Q.addAll(C);
for (j = 1; j < cols; j++)
{
int col = j + rad;
if (j - rad > 0)
{
hh -= Q.getFirst();
Q.pop();
}
if (j + rad < cols)
{
int val = 0;
for (int ry =- rad; ry <= rad; ry++)
{
int y = i + ry;
if (y >= 0 && y < rows)
{
val += mat[y][col];
}
}
hh += val;
Q.addLast(val);
}
}
}
}
我想这主要是由于 Java 中 LinkedList 的选择不当以及两个 LinkedList 之间缺乏有效(非浅)的复制方法。
如何改进第三种Java方法?我是否犯了一些概念错误?一如既往,欢迎任何批评。
更新即使它不能解决问题,按照建议使用ArrayLists
,而不是LinkedList
改进了第三种方法。第二种方法的性能仍然更好(但是当矩阵的行数和列数低于300且窗口半径较小时,第一种未优化的方法在Java中是最快的)
UPDATE2 我可以使用什么工具用于分析我的代码并更深入地了解哪条指令花费最多时间?我使用的是 Mac OS X,使用 NetBeans Profiler 只是向我展示了这三种方法的最终时间不同(看来我无法确定每种方法的范围)
UPDATE3 我正在评分在 java 中使用 System.nanoTime() 计算时间这会导致估计不准确吗?:
long start, end;
start = System.nanoTime();
simpleWindowing(mat, rad);
end = System.nanoTime();
System.out.println(end-start);
start = System.nanoTime();
opt1Windowing(mat, rad);
end = System.nanoTime();
System.out.println(end-start);
start = System.nanoTime();
opt2Windowing(mat, rad);
end = System.nanoTime();
System.out.println(end-start);
I have a matrix which represents an image and I need to cycle over each pixel and for each one of those I have to compute the sum of all its neighbors, ie the pixels that belong to a window of radius rad
centered on the pixel.
I came up with three alternatives:
- The simplest way, the one that recomputes the window for each pixel
- The more optimized way that uses a queue to store the sums of the window columns and cycling through the columns of the matrix updates this queue by adding a new element and removing the oldes
- The even more optimized way that does not need to recompute the queue for each row but incrementally adjusts a previously saved one
I implemented them in c++ using a queue for the second method and a combination of deques for the third (I need to iterate through their elements without destructing them) and scored their times to see if there was an actual improvement. it appears that the third method is indeed faster.
Then I tried to port the code to Java (and I must admit that I'm not very comfortable with it). I used ArrayDeque for the second method and LinkedLists for the third resulting in the third being inefficient in time.
Here is the simplest method in C++ (I'm not posting the java version since it is almost identical):
void normalWindowing(int mat[][MAX], int cols, int rows, int rad){
int i, j;
int h = 0;
for (i = 0; i < rows; ++i)
{
for (j = 0; j < cols; j++)
{
h = 0;
for (int ry =- rad; ry <= rad; ry++)
{
int y = i + ry;
if (y >= 0 && y < rows)
{
for (int rx =- rad; rx <= rad; rx++)
{
int x = j + rx;
if (x >= 0 && x < cols)
{
h += mat[y][x];
}
}
}
}
}
}
}
Here is the second method (the one optimized through columns) in C++:
void opt1Windowing(int mat[][MAX], int cols, int rows, int rad){
int i, j, h, y, col;
queue<int>* q = NULL;
for (i = 0; i < rows; ++i)
{
if (q != NULL)
delete(q);
q = new queue<int>();
h = 0;
for (int rx = 0; rx <= rad; rx++)
{
if (rx < cols)
{
int mem = 0;
for (int ry =- rad; ry <= rad; ry++)
{
y = i + ry;
if (y >= 0 && y < rows)
{
mem += mat[y][rx];
}
}
q->push(mem);
h += mem;
}
}
for (j = 1; j < cols; j++)
{
col = j + rad;
if (j - rad > 0)
{
h -= q->front();
q->pop();
}
if (j + rad < cols)
{
int mem = 0;
for (int ry =- rad; ry <= rad; ry++)
{
y = i + ry;
if (y >= 0 && y < rows)
{
mem += mat[y][col];
}
}
q->push(mem);
h += mem;
}
}
}
}
And here is the Java version:
public static void opt1Windowing(int [][] mat, int rad){
int i, j = 0, h, y, col;
int cols = mat[0].length;
int rows = mat.length;
ArrayDeque<Integer> q = null;
for (i = 0; i < rows; ++i)
{
q = new ArrayDeque<Integer>();
h = 0;
for (int rx = 0; rx <= rad; rx++)
{
if (rx < cols)
{
int mem = 0;
for (int ry =- rad; ry <= rad; ry++)
{
y = i + ry;
if (y >= 0 && y < rows)
{
mem += mat[y][rx];
}
}
q.addLast(mem);
h += mem;
}
}
j = 0;
for (j = 1; j < cols; j++)
{
col = j + rad;
if (j - rad > 0)
{
h -= q.peekFirst();
q.pop();
}
if (j + rad < cols)
{
int mem = 0;
for (int ry =- rad; ry <= rad; ry++)
{
y = i + ry;
if (y >= 0 && y < rows)
{
mem += mat[y][col];
}
}
q.addLast(mem);
h += mem;
}
}
}
}
I recognize this post will be a wall of text. Here is the third method in C++:
void opt2Windowing(int mat[][MAX], int cols, int rows, int rad){
int i = 0;
int j = 0;
int h = 0;
int hh = 0;
deque< deque<int> *> * M = new deque< deque<int> *>();
for (int ry = 0; ry <= rad; ry++)
{
if (ry < rows)
{
deque<int> * q = new deque<int>();
M->push_back(q);
for (int rx = 0; rx <= rad; rx++)
{
if (rx < cols)
{
int val = mat[ry][rx];
q->push_back(val);
h += val;
}
}
}
}
deque<int> * C = new deque<int>(M->front()->size());
deque<int> * Q = new deque<int>(M->front()->size());
deque<int> * R = new deque<int>(M->size());
deque< deque<int> *>::iterator mit;
deque< deque<int> *>::iterator mstart = M->begin();
deque< deque<int> *>::iterator mend = M->end();
deque<int>::iterator rit;
deque<int>::iterator rstart = R->begin();
deque<int>::iterator rend = R->end();
deque<int>::iterator cit;
deque<int>::iterator cstart = C->begin();
deque<int>::iterator cend = C->end();
for (mit = mstart, rit = rstart; mit != mend, rit != rend; ++mit, ++rit)
{
deque<int>::iterator pit;
deque<int>::iterator pstart = (* mit)->begin();
deque<int>::iterator pend = (* mit)->end();
for(cit = cstart, pit = pstart; cit != cend && pit != pend; ++cit, ++pit)
{
(* cit) += (* pit);
(* rit) += (* pit);
}
}
for (i = 0; i < rows; ++i)
{
j = 0;
if (i - rad > 0)
{
deque<int>::iterator cit;
deque<int>::iterator cstart = C->begin();
deque<int>::iterator cend = C->end();
deque<int>::iterator pit;
deque<int>::iterator pstart = (M->front())->begin();
deque<int>::iterator pend = (M->front())->end();
for(cit = cstart, pit = pstart; cit != cend; ++cit, ++pit)
{
(* cit) -= (* pit);
}
deque<int> * k = M->front();
M->pop_front();
delete k;
h -= R->front();
R->pop_front();
}
int row = i + rad;
if (row < rows && i > 0)
{
deque<int> * newQ = new deque<int>();
M->push_back(newQ);
deque<int>::iterator cit;
deque<int>::iterator cstart = C->begin();
deque<int>::iterator cend = C->end();
int rx;
int tot = 0;
for (rx = 0, cit = cstart; rx <= rad; rx++, ++cit)
{
if (rx < cols)
{
int val = mat[row][rx];
newQ->push_back(val);
(* cit) += val;
tot += val;
}
}
R->push_back(tot);
h += tot;
}
hh = h;
copy(C->begin(), C->end(), Q->begin());
for (j = 1; j < cols; j++)
{
int col = j + rad;
if (j - rad > 0)
{
hh -= Q->front();
Q->pop_front();
}
if (j + rad < cols)
{
int val = 0;
for (int ry =- rad; ry <= rad; ry++)
{
int y = i + ry;
if (y >= 0 && y < rows)
{
val += mat[y][col];
}
}
hh += val;
Q->push_back(val);
}
}
}
}
And finally its Java version:
public static void opt2Windowing(int [][] mat, int rad){
int cols = mat[0].length;
int rows = mat.length;
int i = 0;
int j = 0;
int h = 0;
int hh = 0;
LinkedList<LinkedList<Integer>> M = new LinkedList<LinkedList<Integer>>();
for (int ry = 0; ry <= rad; ry++)
{
if (ry < rows)
{
LinkedList<Integer> q = new LinkedList<Integer>();
M.addLast(q);
for (int rx = 0; rx <= rad; rx++)
{
if (rx < cols)
{
int val = mat[ry][rx];
q.addLast(val);
h += val;
}
}
}
}
int firstSize = M.getFirst().size();
int mSize = M.size();
LinkedList<Integer> C = new LinkedList<Integer>();
LinkedList<Integer> Q = null;
LinkedList<Integer> R = new LinkedList<Integer>();
for (int k = 0; k < firstSize; k++)
{
C.add(0);
}
for (int k = 0; k < mSize; k++)
{
R.add(0);
}
ListIterator<LinkedList<Integer>> mit;
ListIterator<Integer> rit;
ListIterator<Integer> cit;
ListIterator<Integer> pit;
for (mit = M.listIterator(), rit = R.listIterator(); mit.hasNext();)
{
Integer r = rit.next();
int rsum = 0;
for (cit = C.listIterator(), pit = (mit.next()).listIterator();
cit.hasNext();)
{
Integer c = cit.next();
Integer p = pit.next();
rsum += p;
cit.set(c + p);
}
rit.set(r + rsum);
}
for (i = 0; i < rows; ++i)
{
j = 0;
if (i - rad > 0)
{
for(cit = C.listIterator(), pit = M.getFirst().listIterator();
cit.hasNext();)
{
Integer c = cit.next();
Integer p = pit.next();
cit.set(c - p);
}
M.removeFirst();
h -= R.getFirst();
R.removeFirst();
}
int row = i + rad;
if (row < rows && i > 0)
{
LinkedList<Integer> newQ = new LinkedList<Integer>();
M.addLast(newQ);
int rx;
int tot = 0;
for (rx = 0, cit = C.listIterator(); rx <= rad; rx++)
{
if (rx < cols)
{
Integer c = cit.next();
int val = mat[row][rx];
newQ.addLast(val);
cit.set(c + val);
tot += val;
}
}
R.addLast(tot);
h += tot;
}
hh = h;
Q = new LinkedList<Integer>();
Q.addAll(C);
for (j = 1; j < cols; j++)
{
int col = j + rad;
if (j - rad > 0)
{
hh -= Q.getFirst();
Q.pop();
}
if (j + rad < cols)
{
int val = 0;
for (int ry =- rad; ry <= rad; ry++)
{
int y = i + ry;
if (y >= 0 && y < rows)
{
val += mat[y][col];
}
}
hh += val;
Q.addLast(val);
}
}
}
}
I guess that most is due to the poor choice of the LinkedList in Java and to the lack of an efficient (not shallow) copy method between two LinkedList.
How can I improve the third Java method? Am I doing some conceptual error? As always, any criticisms is welcome.
UPDATE Even if it does not solve the issue, using ArrayLists
, as being suggested, instead of LinkedList
improves the third method. The second one performs still better (but when the number of rows and columns of the matrix is lower than 300 and the window radius is small the first unoptimized method is the fastest in Java)
UPDATE2 Which tool can I use to profile my code and have a richer understanding of which instruction takes the most time? I'm on Mac OS X and using NetBeans Profiler just shows me that the three methods end up with different times (It seems I'm not able to scope within each method)
UPDATE3 I'm scoring the times in java using System.nanoTime()
can this lead to inaccurate estimates?:
long start, end;
start = System.nanoTime();
simpleWindowing(mat, rad);
end = System.nanoTime();
System.out.println(end-start);
start = System.nanoTime();
opt1Windowing(mat, rad);
end = System.nanoTime();
System.out.println(end-start);
start = System.nanoTime();
opt2Windowing(mat, rad);
end = System.nanoTime();
System.out.println(end-start);
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(4)
对于进行随机访问的列表来说,LinkedList 是一个非常糟糕的选择。
对于每个
get(int)
都会扫描列表,直到到达请求索引。get(1)
相当快,但是get(100)
比 get 慢 100 倍,而get(1000)
比 get 慢 1000 倍(1)您应该更改为使用 ArrayList,并使用预期大小初始化 ArrayList,以避免不必要地调整内部数组的大小。
编辑
虽然我对 get() 和 LinkedList 的评论是正确的,但它们不适用于这种情况。我不知何故忽略了列表中没有随机访问。
LinkedList is a very bad choice for a list with where you do random access.
For each
get(int)
scans the list until the request index is reached.get(1)
is quite fast, butget(100)
is 100 times slower andget(1000)
is a 1000 times slower than get(1)You should change that to use ArrayList instead and initialize the ArrayList with the expected size to avoid unnecessary resizing of the internal Array.
Edit
While my coments about get() and LinkedList are correct, they do not apply in this context. I somehow overlooked that there is no random access to the list.
使用
int[]
而不是List
。Lists
存储需要从int
到Integer
相互转换的对象。Use an
int[]
instead of aList
.Lists
store Objects requiring a conversion fromint
toInteger
and back.我确实为该例程实现了两个优化版本:
int
数组作为队列来缓存求和的列值,因为算法按列扫描根据实施该技术的特定领域,一种技术可以比另一种技术任意快。在我的例子中,求和面积表必须计算多次,因此对于半径值小于 20 像素的情况,它比第一种方法慢。
I indeed implemented two optimized versions for that routine:
int
as a queue to cache the summed column values as the algorithm scans the image by columnsOne technique can be arbitrary faster than the other according to the specific domain in which it is implemented. In mine, the summed area table had to be computed several times and so it resulted slower than the first method for a radius value lesser than 20 pixel.
关于为您的代码计时:System.nanoTime() 没问题(据我所知,我认为您不会变得更好,因为它使用操作系统计时器),但是:
不要尝试如果测量的任务太短,那么准确性就不太好。我认为任何少于几毫秒的事情都会给你带来潜在的麻烦。参考文献,有人吗?
多次测量,并取测量值的中位数。外部影响会严重减慢执行速度,使您的估计毫无用处。取均值效果不太好,因为它对此类异常值很敏感。
许多 JVM 都有 JIT 编译器,您可能希望在测量之前多次执行代码,因此编译器不会在测量中间的某个位置启动,并且一半的测量突然比其余测量快 10 倍。在虚拟机“预热”之后进行更好的测量。
About timing your code: System.nanoTime() is ok (i don't think you can get better since it's using OS timers as far as i know), but:
don't try to measure too short of a task, then the accuracy isnt so good. I think anything less than a few milliseconds will give you potential trouble. References, anyone?
measure multiple times, and take the median of the measurements. Outside effects can severely slow the execution, making your estimation useless. Taking the mean doesn't work too well because it is sensitive to such outliers.
many JVMs have JIT compiler, you may want to execute your code multiple times before you measure, so the compiler doesn't kick in somewhere in the middle of your measurement and half of your measurements are suddenly 10x faster than the rest. Better measure after your VM has "warmed up".