Node.js 抓取工具中的内存泄漏

发布于 2024-11-02 14:00:04 字数 5817 浏览 1 评论 0原文

这是一个用 JavaScript 和 Node.js 编写的简单抓取工具，用于抓取 Wikipedia 中的元素周期表元素数据。依赖项是用于 DOM 操作的 jsdom 和 chain-gang 用于排队。

大多数时候它工作得很好（它不能优雅地处理错误），而且代码也不算太糟糕，我敢说 for 尝试，但它有一个严重的错误 - 它在任何地方都会可怕地泄漏内存每个元素占用计算机内存的 0.3% 到 0.6%，这样当它领先时，它会使用接近 20% 的内存，这显然是不可接受的。

我尝试过使用分析器，但我发现它们要么没有帮助，要么难以解释数据。我怀疑这与 processElement 的传递方式有关，但我很难将队列代码重写为更优雅的代码。

var fs = require('fs'),
    path = require('path'),
    jsdom = require("jsdom"),
    parseUrl = require('url').parse,
    chainGang = require('chain-gang');

var chain = chainGang.create({
    workers: 1
});

var Settings = {
    periodicUrl: 'http://en.wikipedia.org/wiki/Template:Periodic_table',
    periodicSelector: '#bodyContent > table:first',
    pathPrefix: 'data/',
    ignoredProperties: ['Pronunciation']
};

function writeToFile(output) {
    var keys = 0;

    // Huge nests for finding the name of the element... yeah
    for(var i in output) {
        if(typeof output[i] === 'object' && output[i] !== null){
            for(var l in output[i]) {
                if(l.toLowerCase() === 'name') {
                    var name = output[i][l];
                }
            }

            keys += Object.keys(output[i]).length;
        }
    }

    console.log('Scraped ' + keys + ' properties for ' + name);
    console.log('Writing to ' + Settings.pathPrefix + name + '.json');
    fs.writeFile(Settings.pathPrefix + name + '.json', JSON.stringify(output));
}

// Generic create task function to create a task function that
// would be passed to the chain gang
function createTask (url, callback) {
    console.log('Task added - ' + url);

    return function(worker){
        console.log('Requesting: ' +url);

        jsdom.env(url, [
            'jquery.min.js' // Local copy of jQuery
        ], function(errors, window) {
            if(errors){
                console.log('Error! ' + errors)
                createTask(url, callback);
            } else {
                // Give me thy $
                var $ = window.$;

                // Cleanup - remove unneeded elements
                $.fn.cleanup = function() {
                    return this.each(function(){
                        $(this).find('sup.reference, .IPA').remove().end()
                            .find('a, b, i, small, span').replaceWith(function(){
                                return this.innerHTML;
                            }).end()
                            .find('br').replaceWith(' ');
                    });
                }

                callback($);
            }

            worker.finish();
        });
    }
}

function processElement ($){
    var infoBox = $('.infobox'),
        image = infoBox.find('tr:contains("Appearance") + tr img:first'),
        description = $('#toc').prevAll('p').cleanup(),
        headers = infoBox.find('tr:contains("properties")'),
        output = {
            Appearance: image.attr('src'),
            Description: $('.infobox + p').cleanup().html()
        };

    headers.each(function(){
        var that = this,
            title = this.textContent.trim(),
            rowspan = 0,
            rowspanHeading = '';

        output[title] = {};

        $(this).nextUntil('tr:has(th:only-child)').each(function(){
            var t = $(this).cleanup(),
                headingEle = t.children('th'),
                data = t.children('td').html().trim();

            if(headingEle.length) {
                var heading = headingEle.html().trim();
            }

            // Skip to next heading if current property is ignored
            if(~Settings.ignoredProperties.indexOf(heading)) {
                return true;
            }

            if (rowspan) {
                output[title][rowspanHeading][data.split(':')[0].trim()] = data.split(':')[1].trim();
                rowspan--;
            } else if (headingEle.attr('rowspan')){
                rowspan = headingEle.attr('rowspan') - 1;
                rowspanHeading = heading;

                output[title][heading] = {};
                output[title][heading][data.split(':')[0]] = data.split(':')[1];
            } else if (~heading.indexOf(',')){
                data = data.split(',');

                heading.split(',').forEach(function(v, i){
                    output[title][v.trim()] = data[i].trim();
                });
            } else {
                output[title][heading] = data;
            }
        });
    });

    writeToFile(output);
}

function fetchElements(elements) {
    elements.forEach(function(value){
        // Element URL used here as task id (second argument)
        chain.add(createTask(value, processElement), value);
    });
}

function processTable($){
    var elementArray = $(Settings.periodicSelector).find('td').map(function(){
        var t = $(this),
            atomicN = parseInt(t.text(), 10);

        if(atomicN && t.children('a').length) {
            var elementUrl = 'http://' + parseUrl(Settings.periodicUrl).host + t.children('a:first').attr('href');

            console.log(atomicN, t.children('a:first').attr('href').split('/').pop(), elementUrl);
            return elementUrl;
        }
    }).get();

    fetchElements(elementArray);
    fs.writeFile(Settings.pathPrefix + 'elements.json', JSON.stringify(elementArray));
}

// Get table - init
function getPeriodicList(){
    var elementsList = Settings.pathPrefix + 'elements.json';

    if(path.existsSync(elementsList)){
        var fileData = JSON.parse(fs.readFileSync(elementsList, 'utf8'));
        fetchElements(fileData);
    } else {
        chain.add(createTask(Settings.periodicUrl, processTable));
    }
}

getPeriodicList();

原文

This is a simple scraper written in JavaScript with Node.js, for scraping Wikipedia for periodic table element data. The dependencies are jsdom for DOM manipulation and chain-gang for queuing.

It works fine, most of the time (it doesn't handle errors gracefully), and the code isn't too bad, dare I say for a for attempt, but there is a serious fault with it - it leaks memory horribly, anywhere from 0.3% to 0.6% of the computer's memory for each element, such that by the time it gets to lead it would be using somewhere close to 20%, which is plainly unacceptable.

I've tried working with profilers, but I have either not found them to be helpful or have difficulty interpreting the data. I suspect it has something to do with the way processElement gets passed around, but I have difficulty in rewriting the queue code into something more elegant.

var fs = require('fs'),
    path = require('path'),
    jsdom = require("jsdom"),
    parseUrl = require('url').parse,
    chainGang = require('chain-gang');

var chain = chainGang.create({
    workers: 1
});

var Settings = {
    periodicUrl: 'http://en.wikipedia.org/wiki/Template:Periodic_table',
    periodicSelector: '#bodyContent > table:first',
    pathPrefix: 'data/',
    ignoredProperties: ['Pronunciation']
};

function writeToFile(output) {
    var keys = 0;

    // Huge nests for finding the name of the element... yeah
    for(var i in output) {
        if(typeof output[i] === 'object' && output[i] !== null){
            for(var l in output[i]) {
                if(l.toLowerCase() === 'name') {
                    var name = output[i][l];
                }
            }

            keys += Object.keys(output[i]).length;
        }
    }

    console.log('Scraped ' + keys + ' properties for ' + name);
    console.log('Writing to ' + Settings.pathPrefix + name + '.json');
    fs.writeFile(Settings.pathPrefix + name + '.json', JSON.stringify(output));
}

// Generic create task function to create a task function that
// would be passed to the chain gang
function createTask (url, callback) {
    console.log('Task added - ' + url);

    return function(worker){
        console.log('Requesting: ' +url);

        jsdom.env(url, [
            'jquery.min.js' // Local copy of jQuery
        ], function(errors, window) {
            if(errors){
                console.log('Error! ' + errors)
                createTask(url, callback);
            } else {
                // Give me thy $
                var $ = window.$;

                // Cleanup - remove unneeded elements
                $.fn.cleanup = function() {
                    return this.each(function(){
                        $(this).find('sup.reference, .IPA').remove().end()
                            .find('a, b, i, small, span').replaceWith(function(){
                                return this.innerHTML;
                            }).end()
                            .find('br').replaceWith(' ');
                    });
                }

                callback($);
            }

            worker.finish();
        });
    }
}

function processElement ($){
    var infoBox = $('.infobox'),
        image = infoBox.find('tr:contains("Appearance") + tr img:first'),
        description = $('#toc').prevAll('p').cleanup(),
        headers = infoBox.find('tr:contains("properties")'),
        output = {
            Appearance: image.attr('src'),
            Description: $('.infobox + p').cleanup().html()
        };

    headers.each(function(){
        var that = this,
            title = this.textContent.trim(),
            rowspan = 0,
            rowspanHeading = '';

        output[title] = {};

        $(this).nextUntil('tr:has(th:only-child)').each(function(){
            var t = $(this).cleanup(),
                headingEle = t.children('th'),
                data = t.children('td').html().trim();

            if(headingEle.length) {
                var heading = headingEle.html().trim();
            }

            // Skip to next heading if current property is ignored
            if(~Settings.ignoredProperties.indexOf(heading)) {
                return true;
            }

            if (rowspan) {
                output[title][rowspanHeading][data.split(':')[0].trim()] = data.split(':')[1].trim();
                rowspan--;
            } else if (headingEle.attr('rowspan')){
                rowspan = headingEle.attr('rowspan') - 1;
                rowspanHeading = heading;

                output[title][heading] = {};
                output[title][heading][data.split(':')[0]] = data.split(':')[1];
            } else if (~heading.indexOf(',')){
                data = data.split(',');

                heading.split(',').forEach(function(v, i){
                    output[title][v.trim()] = data[i].trim();
                });
            } else {
                output[title][heading] = data;
            }
        });
    });

    writeToFile(output);
}

function fetchElements(elements) {
    elements.forEach(function(value){
        // Element URL used here as task id (second argument)
        chain.add(createTask(value, processElement), value);
    });
}

function processTable($){
    var elementArray = $(Settings.periodicSelector).find('td').map(function(){
        var t = $(this),
            atomicN = parseInt(t.text(), 10);

        if(atomicN && t.children('a').length) {
            var elementUrl = 'http://' + parseUrl(Settings.periodicUrl).host + t.children('a:first').attr('href');

            console.log(atomicN, t.children('a:first').attr('href').split('/').pop(), elementUrl);
            return elementUrl;
        }
    }).get();

    fetchElements(elementArray);
    fs.writeFile(Settings.pathPrefix + 'elements.json', JSON.stringify(elementArray));
}

// Get table - init
function getPeriodicList(){
    var elementsList = Settings.pathPrefix + 'elements.json';

    if(path.existsSync(elementsList)){
        var fileData = JSON.parse(fs.readFileSync(elementsList, 'utf8'));
        fetchElements(fileData);
    } else {
        chain.add(createTask(Settings.periodicUrl, processTable));
    }
}

getPeriodicList();

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

宛菡 2024-11-09 14:00:04

jsdom 确实存在内存泄漏，该泄漏源于节点 vm.runInContext() 背后的复制输入和复制输出逻辑。人们已经努力使用 C++ 来解决这个问题，我们希望在尝试将其推入节点之前证明该解决方案。

目前的解决方法是为每个 dom 生成一个子进程，并在完成后将其关闭。

编辑：

从 jsdom 0.2.3 开始，只要您在完成后关闭窗口 (window.close())，这个问题就可以解决。

回复收藏 0 原文

原野 2024-11-09 14:00:04

对于使用节点进行类似 jQuery 的 html 处理，我现在使用 cheerio 而不是 jsdom。到目前为止，我在几个小时内废弃和解析超过 10K 页面时还没有看到任何内存泄漏。

回复收藏 0 原文

清风夜微凉 2024-11-09 14:00:04

我认为我有一个更好的解决方法，通过设置 window.document.innerHTML 属性来重用 jsdom 实例。解决了我的内存泄漏问题！

    // jsdom has a memory leak when using multiple instance
    // cache a single instance and swap out innerHTML
    var dom = require('jsdom');
    var win;
    var useJQuery = function(html, fnCallback) {
        if (!win) {
            var defEnv = {
                html:html,
                scripts:['jquery-1.5.min.js'],
            };
            dom.env(defEnv, function (err, window) {
                if (err) throw new Error('failed to init dom');
                win = window;
                fnCallback(window.jQuery);
            });
        }
        else {
            win.document.innerHTML = html;
            fnCallback(win.jQuery);
        }
    };
    ....
    // Use it!
    useJQuery(html, function($) { $('woohoo').val('test'); });

I think I have a better work-around, reuse your instance of jsdom by setting the window.document.innerHTML property. Solved my memory leak problems!

    // jsdom has a memory leak when using multiple instance
    // cache a single instance and swap out innerHTML
    var dom = require('jsdom');
    var win;
    var useJQuery = function(html, fnCallback) {
        if (!win) {
            var defEnv = {
                html:html,
                scripts:['jquery-1.5.min.js'],
            };
            dom.env(defEnv, function (err, window) {
                if (err) throw new Error('failed to init dom');
                win = window;
                fnCallback(window.jQuery);
            });
        }
        else {
            win.document.innerHTML = html;
            fnCallback(win.jQuery);
        }
    };
    ....
    // Use it!
    useJQuery(html, function($) { $('woohoo').val('test'); });

回复收藏 0 原文