我使用 Node.js 的 async 和 request 模块抓取超过 1 亿个网站,但运行几分钟后就不断遇到 ESOCKETTIMEDOUT 和 ETIMEDOUT 错误.
重新启动脚本后又能恢复正常.这似乎不是连接数限制的问题,因为出错时我仍然可以毫无延迟地执行 resolve4、resolveNs、resolveMx 以及 curl.
你能看出代码有什么问题吗?或者有什么建议?我想把 async.queue() 的并发数提高到至少 1000.谢谢.
var request = require('request'),
    async = require('async'),
    mysql = require('mysql'),
    dns = require('dns'),
    url = require('url'),
    cheerio = require('cheerio'),
    iconv = require('iconv-lite'),
    charset = require('charset'),
    config = require('./spy.config'),
    pool = mysql.createPool(config.db);

iconv.skipDecodeWarning = true;

/**
 * Crawl queue. Each task is `{ id, domain }`. The worker first resolves
 * `www.<domain>` and, if that succeeds, fetches `http://www.<domain>`.
 * When the HTTP request fails, the NS / A / MX records of the bare domain
 * are looked up in parallel. Up to 200 tasks run concurrently.
 */
var queue = async.queue(function (task, cb) {
    dns.resolve4('www.' + task.domain, function (err, addresses) {
        if (err) {
            // Do something
            setImmediate(function () { cb(); });
            return;
        }

        request({
            url: 'http://www.' + task.domain,
            method: 'GET',
            encoding: 'binary',
            followRedirect: true,
            // The original literal listed `pool` twice (`false`, then this
            // object); the later key always won, so only this one is kept.
            pool: { maxSockets: 1000 },
            timeout: 15000 // 15 sec
        }, function (error, response, body) {
            if (!error) {
                // If ok, do something
            } else {
                // If not ok, do these
                console.log(error);
                // Errors seen here after a few minutes of running:
                // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
                // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }

                var ns = [], ip = [], mx = [];

                async.parallel([
                    function (callback) {
                        // Resolves the domain's name server records
                        dns.resolveNs(task.domain, function (err, addresses) {
                            if (!err) {
                                ns = addresses;
                            }
                            callback();
                        });
                    },
                    function (callback) {
                        // Resolves the domain's IPV4 addresses
                        dns.resolve4(task.domain, function (err, addresses) {
                            if (!err) {
                                ip = addresses;
                            }
                            callback();
                        });
                    },
                    function (callback) {
                        // Resolves the domain's MX records
                        dns.resolveMx(task.domain, function (err, addresses) {
                            if (!err) {
                                addresses.forEach(function (a) {
                                    mx.push(a.exchange);
                                });
                            }
                            callback();
                        });
                    }
                ], function (err) {
                    // The original called an undefined `next(err)` here, which
                    // would have thrown a ReferenceError; report the error instead.
                    if (err) {
                        console.log(err);
                        return;
                    }
                    // do something
                });
            }

            // The task is finished as soon as the HTTP request completes; the
            // DNS lookups above are not awaited before releasing the queue slot.
            setImmediate(function () { cb(); });
        });
    });
}, 200);

// When the queue is emptied we want to check if we're done
// NOTE(review): assigning `queue.drain` is the callback style of older
// releases of the async library - confirm against the installed version.
queue.drain = function () {
    setImmediate(function () { checkDone(); });
};

/** Logging hook; currently a no-op because the console call is disabled. */
function consoleLog(msg) {
    //console.info(msg);
}

/** Loads the next batch from the database once the queue holds no waiting tasks. */
function checkDone() {
    if (queue.length() === 0) {
        setImmediate(function () { crawlQueue(); });
    } else {
        console.log("checkDone() not zero");
    }
}

/**
 * Runs a SQL statement on a pooled connection and discards the result rows.
 * @param {string} sql - statement to execute
 */
function query(sql) {
    pool.getConnection(function (err, connection) {
        if (err) {
            // Previously ignored silently; surface the failure.
            console.log(err);
            return;
        }
        connection.query(sql, function (err, results) {
            connection.release();
            if (err) {
                console.log(err);
            }
        });
    });
}

/**
 * Selects up to 500 domains whose `last_update` is more than 2592000 seconds
 * (30 days) old and pushes them onto the crawl queue. Exits the process when
 * no rows are returned; retries when the connection or the query fails.
 */
function crawlQueue() {
    pool.getConnection(function (err, connection) {
        if (err) {
            console.log(err);
            setImmediate(function () { crawlQueue(); });
            return;
        }

        // Fixed: the original statement had no WHERE keyword, so it always
        // failed with a syntax error and this function retried forever.
        var sql = "SELECT * FROM domain WHERE last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";

        connection.query(sql, function (err, results) {
            connection.release();

            if (err) {
                console.log(err);
                setImmediate(function () { crawlQueue(); });
                return;
            }

            if (!results.length) {
                process.exit();
            }

            for (var i = 0, len = results.length; i < len; ++i) {
                queue.push({ "id": results[i]['id'], "domain": results[i]['domain'] });
            }
        });
    });
}

setImmediate(function () { crawlQueue(); });
系统限制非常高.
Limit Soft Limit Hard Limit Units Max cpu time unlimited unlimited seconds Max file size unlimited unlimited bytes Max data size unlimited unlimited bytes Max stack size 8388608 unlimited bytes Max core file size 0 unlimited bytes Max resident set unlimited unlimited bytes Max processes 257645 257645 processes Max open files 500000 500000 files Max locked memory 65536 65536 bytes Max address space unlimited unlimited bytes Max file locks unlimited unlimited locks Max pending signals 257645 257645 signals Max msgqueue size 819200 819200 bytes Max nice priority 0 0 Max realtime priority 0 0 Max realtime timeout unlimited unlimited us
sysctl 设置:
net.ipv4.ip_local_port_range = 10000 61000
Motiejus Jak.. 17
默认情况下,Node 只有 4 个工作线程来解析 DNS 查询.如果 DNS 查询耗时很长,请求就会阻塞在 DNS 解析阶段,表现出来的症状正是 ESOCKETTIMEDOUT 或 ETIMEDOUT.
尝试增加你的uv线程池大小:
# Export the larger libuv thread pool size first, then start the script
# as a separate command.
export UV_THREADPOOL_SIZE=128
node ...
或者在 index.js(或你的任何入口文件)中设置:
#!/usr/bin/env node

// Kept on its own line: when fused onto the shebang line above, the
// assignment is part of that line and never executes.
process.env.UV_THREADPOOL_SIZE = 128;

function main() {
    // ...
}
编辑:这是一篇关于它的博客文章.
我有同样的问题。阅读此讨论后,可以通过在request选项中使用“ agent:false”来解决此问题。
2017/10/31 更新:上面的原始回复似乎并未完全解决问题。我们最终找到的解决方案是在 Agent 中启用 keepAlive 选项。例如:
// Shared HTTPS agent created with keepAlive enabled; every options object
// built below points at this same agent.
var pool = new https.Agent({ keepAlive: true });

/**
 * Builds the options object for a JSON GET request.
 * @param {string} _url - address to request
 * @returns {{url: string, method: string, agent: object, json: boolean}}
 */
function getJsonOptions(_url) {
    var options = {};
    options.url = _url;
    options.method = 'GET';
    options.agent = pool;
    options.json = true;
    return options;
}
Node 的默认连接池似乎默认 keepAlive = false,这会导致每个请求都新建一个连接。如果在短时间内创建太多连接,就会出现上述错误。我的猜测是,服务路径上的一个或多个路由器可能因为怀疑是"拒绝服务"攻击而拦截了连接请求。无论如何,上面的代码示例完全解决了我们的问题。
默认情况下,Node 只有 4 个工作线程来解析 DNS 查询.如果 DNS 查询耗时很长,请求就会阻塞在 DNS 解析阶段,表现出来的症状正是 ESOCKETTIMEDOUT 或 ETIMEDOUT.
尝试增加你的uv线程池大小:
# Export the larger libuv thread pool size first, then start the script
# as a separate command.
export UV_THREADPOOL_SIZE=128
node ...
或者在 index.js(或你的任何入口文件)中设置:
#!/usr/bin/env node

// Kept on its own line: when fused onto the shebang line above, the
// assignment is part of that line and never executes.
process.env.UV_THREADPOOL_SIZE = 128;

function main() {
    // ...
}
编辑:这是一篇关于它的博客文章.