Node.js GET 请求出现 ETIMEDOUT 和 ESOCKETTIMEDOUT 错误

 蜡笔小新11953150 发布于 2023-01-08 13:46

我使用 Node.js 的 async 和 request 模块抓取一亿多个网站,但运行几分钟后就会不断遇到 ESOCKETTIMEDOUT 和 ETIMEDOUT 错误.

重新启动脚本后它又能正常工作.这似乎不是连接数限制的问题,因为出错时我仍然可以正常执行 resolve4、resolveNs、resolveMx,并且 curl 也没有延迟.

你看到代码有什么问题吗?或任何建议?我想把async.queue()并发推高到至少1000.谢谢.

var request = require('request'),
    async = require('async'),
    mysql = require('mysql'),
    dns = require('dns'),
    url = require('url'),
    cheerio = require('cheerio'),
    iconv = require('iconv-lite'),
    charset = require('charset'),
    config = require('./spy.config'),
    pool = mysql.createPool(config.db);

iconv.skipDecodeWarning = true;

/**
 * Work queue that probes each domain over HTTP.
 *
 * Every task is an object carrying a `domain` property. The worker first
 * resolves `www.<domain>` to IPv4 addresses; when that succeeds it issues a
 * GET request to `http://www.<domain>`. If the request fails, the bare
 * domain's NS, A and MX records are looked up in parallel as a fallback.
 *
 * The task is reported as finished (via `cb`) only after ALL of its work,
 * including the fallback DNS lookups, has completed, so the concurrency
 * limit passed to async.queue (200) bounds the total work in flight.
 */
var queue = async.queue(function (task, cb) {
    // Report completion on a later tick; called exactly once per task.
    function finish() {
        setImmediate(function () {
            cb();
        });
    }

    dns.resolve4('www.' + task.domain, function (err, addresses) {
        if (err) {
            //
            // Do something
            //
            finish();
            return;
        }

        request({
            url: 'http://www.' + task.domain,
            method: 'GET',
            encoding:       'binary',
            followRedirect: true,
            // `pool` must appear only once: a repeated key silently overrides
            // the earlier value (and is a SyntaxError in ES5 strict mode).
            pool:           { maxSockets: 1000 },
            timeout:        15000 // 15 sec
        }, function (error, response, body) {

            //console.info(task);

            if (!error) {
                // If ok, do something
                finish();
                return;
            }

            // If not ok, do these
            console.log(error);

            // Errors reported at this point after a few minutes of running:
            // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
            // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }

            var ns = [],
                ip = [],
                mx = [];
            async.parallel([
                function (callback) {
                    // Resolves the domain's name server records
                    dns.resolveNs(task.domain, function (err, addresses) {
                        if (!err) {
                            ns = addresses;
                        }
                        callback();
                    });
                }, function (callback) {
                    // Resolves the domain's IPV4 addresses
                    dns.resolve4(task.domain, function (err, addresses) {
                        if (!err) {
                            ip = addresses;
                        }
                        callback();
                    });
                }, function (callback) {
                    // Resolves the domain's MX records
                    dns.resolveMx(task.domain, function (err, addresses) {
                        if (!err) {
                            addresses.forEach(function (a) {
                                mx.push(a.exchange);
                            });
                        }
                        callback();
                    });
                }
            ], function (err) {
                // The lookups above never forward an error, but log one
                // defensively instead of calling an undefined handler.
                if (err) {
                    console.log(err);
                }

                // do something

                // Finish only now, so the fallback lookups count against
                // the queue's concurrency limit.
                finish();
            });
        });
    });
}, 200);

// Drain hook: once the queue reports it has been emptied, defer a
// completion check to a later tick.
queue.drain = function onDrain() {
    setImmediate(checkDone);
};
/**
 * Debug logging hook. Output is currently switched off, so calling this
 * does nothing and returns undefined.
 *
 * @param {*} msg - Message that would be printed when logging is enabled.
 */
function consoleLog(msg) {
    // Logging disabled; restore a console.info(msg) call here to enable it.
    return;
}
/**
 * Checks whether the work queue has no waiting tasks and, if so, schedules
 * crawlQueue() on a later tick to load more work. When tasks are still
 * waiting, only a diagnostic line is logged.
 */
function checkDone() {
    var waiting = queue.length();

    if (waiting != 0) {
        console.log("checkDone() not zero");
        return;
    }

    setImmediate(function refill() {
        crawlQueue();
    });
}

/**
 * Runs a fire-and-forget SQL statement on a connection taken from the pool.
 *
 * Nothing is handed back to the caller. The connection is released as soon
 * as the statement completes, and any failure (acquiring the connection or
 * running the statement) is logged instead of being silently dropped.
 *
 * @param {string} sql - SQL text passed to connection.query().
 */
function query(sql) {
    pool.getConnection(function (connErr, connection) {
        if (connErr) {
            console.error(connErr);
            return;
        }

        //console.log(sql);
        connection.query(sql, function (queryErr, results) {
            // Always hand the connection back, whether or not the query failed.
            connection.release();
            if (queryErr) {
                console.error(queryErr);
            }
        });
    });
}

/**
 * Loads the next batch of domains to crawl and pushes them onto the queue.
 *
 * Selects up to 500 rows from `domain` whose `last_update` is older than
 * 2592000 seconds (30 days) and pushes one `{id, domain}` task per row.
 * When no rows match, the process exits. If acquiring a connection or
 * running the query fails, the function reschedules itself on a later tick
 * (note: the retry is immediate, with no back-off).
 */
function crawlQueue() {
    // Try again on a later tick after a database failure.
    function retry() {
        setImmediate(function () {
            crawlQueue();
        });
    }

    pool.getConnection(function (connErr, connection) {
        if (connErr) {
            retry();
            return;
        }

        // The WHERE keyword is required; without it the statement is a
        // syntax error and every call would fail and retry forever.
        var sql = "SELECT * FROM domain WHERE last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";
        connection.query(sql, function (queryErr, results) {
            // The rows are already in memory, so release the connection first.
            connection.release();

            if (queryErr) {
                retry();
                return;
            }

            if (!results.length) {
                // Nothing left to crawl.
                process.exit();
                return;
            }

            for (var i = 0, len = results.length; i < len; ++i) {
                queue.push({"id": results[i]['id'], "domain": results[i]['domain'] });
            }
        });
    });
}
// Kick off the first batch on a later tick, after the script has finished loading.
setImmediate(function startCrawl() {
    crawlQueue();
});

系统限制非常高.

    Limit                     Soft Limit           Hard Limit           Units
    Max cpu time              unlimited            unlimited            seconds
    Max file size             unlimited            unlimited            bytes
    Max data size             unlimited            unlimited            bytes
    Max stack size            8388608              unlimited            bytes
    Max core file size        0                    unlimited            bytes
    Max resident set          unlimited            unlimited            bytes
    Max processes             257645               257645               processes
    Max open files            500000               500000               files
    Max locked memory         65536                65536                bytes
    Max address space         unlimited            unlimited            bytes
    Max file locks            unlimited            unlimited            locks
    Max pending signals       257645               257645               signals
    Max msgqueue size         819200               819200               bytes
    Max nice priority         0                    0
    Max realtime priority     0                    0
    Max realtime timeout      unlimited            unlimited            us

sysctl 配置:

net.ipv4.ip_local_port_range = 10000    61000

Motiejus Jak.. 17

默认情况下,Node 只有 4 个工作线程用来解析 DNS 查询.如果您的 DNS 查询耗时很长,请求就会阻塞在 DNS 阶段,其症状正是 ESOCKETTIMEDOUT 或 ETIMEDOUT.

尝试增加你的uv线程池大小:

export UV_THREADPOOL_SIZE=128
node ...

或在index.js(或在你的切入点的任何地方):

#!/usr/bin/env node
// Sets the UV_THREADPOOL_SIZE environment variable from inside the process,
// as an alternative to exporting it in the shell before launching node.
// NOTE(review): presumably this must run before anything uses the libuv
// thread pool in order to take effect — confirm against the Node.js docs.
process.env.UV_THREADPOOL_SIZE = 128;

// Placeholder entry point; its body is elided in this example.
function main() {
   ...
}

编辑:这是一篇关于它的博客文章.

2 个回答
  • 我有同样的问题。阅读此讨论后,可以通过在request选项中使用“ agent:false”来解决此问题。

    10/31/2017上面的原始回复似乎并未完全解决问题。我们找到的最终解决方案是在代理中使用keepAlive选项。例如:

    // Shared HTTPS agent with keep-alive enabled; every options object built
    // below points at this same agent.
    var pool = new https.Agent({ keepAlive: true });

    /**
     * Builds the options object for a JSON GET request that goes through the
     * shared keep-alive agent.
     *
     * @param {string} _url - Target URL of the request.
     * @returns {Object} Options with `url`, `method`, `agent` and `json` set.
     */
    function getJsonOptions(_url) {
        var options = {};
        options.url = _url;
        options.method = 'GET';
        options.agent = pool;
        options.json = true;
        return options;
    }
    

    Node 的默认连接池似乎默认为 keepAlive = false,这将导致每个请求都创建一个新连接。如果在短时间内创建太多连接,就会出现上述错误。我的猜测是,服务路径上的一个或多个路由器可能会拦截连接请求,可能是因为怀疑遭到“拒绝服务”攻击。无论如何,以上代码示例完全解决了我们的问题。

    2023-01-08 13:48 回答
  • 默认情况下,Node 只有 4 个工作线程用来解析 DNS 查询.如果您的 DNS 查询耗时很长,请求就会阻塞在 DNS 阶段,其症状正是 ESOCKETTIMEDOUT 或 ETIMEDOUT.

    尝试增加你的uv线程池大小:

    export UV_THREADPOOL_SIZE=128
    node ...
    

    或在index.js(或在你的切入点的任何地方):

    #!/usr/bin/env node
    // Sets the UV_THREADPOOL_SIZE environment variable from inside the process,
    // as an alternative to exporting it in the shell before launching node.
    // NOTE(review): presumably this must run before anything uses the libuv
    // thread pool in order to take effect — confirm against the Node.js docs.
    process.env.UV_THREADPOOL_SIZE = 128;
    
    // Placeholder entry point; its body is elided in this example.
    function main() {
       ...
    }
    

    编辑:这是一篇关于它的博客文章.

    2023-01-08 13:48 回答
撰写答案
今天,你开发时遇到什么问题呢?
立即提问
热门标签
PHP1.CN | 中国最专业的PHP中文社区 | PNG素材下载 | DevBox开发工具箱 | json解析格式化 |PHP资讯 | PHP教程 | 数据库技术 | 服务器技术 | 前端开发技术 | PHP框架 | 开发工具 | 在线工具
Copyright © 1998 - 2020 PHP1.CN. All Rights Reserved 京公网安备 11010802041100号 | 京ICP备19059560号-4 | PHP1.CN 第一PHP社区 版权所有