Node.js - Web Crawling
Crawling
Use request or the built-in http module to get the raw HTML.
Request
var request = require('request');
var url = 'http://foo.com';
// plain text
request(url, function (err, res, body) {
  // body holds the page HTML as a string
});
// gzip
request({url: url, gzip: true}, function (err, res, body) {
  // body is transparently gunzipped
});
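In practice you will usually want to check err and the status code before parsing; a minimal sketch along the same lines:

request({url: url, gzip: true}, function (err, res, body) {
  if (err) return console.error(err);                 // network-level failure
  if (res.statusCode !== 200) {                       // non-OK HTTP response
    return console.error('Unexpected status: ' + res.statusCode);
  }
  // safe to parse body here
});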
http
var http = require('http');
var qs = require('querystring');

http.request({
  host: 'search.twitter.com',
  path: '/search.json?' + qs.stringify({ q: search })   // `search` holds the query term
}, function (res) {
  // consume the response stream here
}).end();
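The options object also accepts port and headers; for example, to ask the server for a gzipped response (assuming the server honours the header):

http.request({
  host: 'search.twitter.com',
  path: '/search.json?' + qs.stringify({ q: search }),
  headers: { 'Accept-Encoding': 'gzip' }
}, function (res) {
  // res will be compressed if the server supports gzip
}).end();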
If plain text
http.get(url, function (res) {
  var buffer = [];
  res
    .on('data', function (data) {
      buffer.push(data);
    })
    .on('end', function () {
      parsePage(buffer.join(''));
    });
});
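Joining raw Buffer chunks can mangle multi-byte characters that happen to be split across chunk boundaries; calling res.setEncoding('utf8') makes the stream emit properly decoded strings instead:

http.get(url, function (res) {
  res.setEncoding('utf8');   // emit decoded strings rather than Buffers
  var buffer = [];
  res
    .on('data', function (data) { buffer.push(data); })
    .on('end', function () { parsePage(buffer.join('')); });
});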
If gzipped
var zlib = require('zlib');

http.get(url, function (res) {
  var buffer = [];
  var gunzip = zlib.createGunzip();
  res.pipe(gunzip);
  gunzip
    .on('data', function (data) {
      buffer.push(data);
    })
    .on('end', function () {
      parsePage(buffer.join(''));
    });
});
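If you are not sure whether the server gzips its responses, you can branch on the Content-Encoding header before piping through zlib; a minimal sketch:

http.get(url, function (res) {
  var stream = res;
  if (res.headers['content-encoding'] === 'gzip') {
    stream = res.pipe(zlib.createGunzip());   // decompress on the fly
  }
  var buffer = [];
  stream
    .on('data', function (data) { buffer.push(data); })
    .on('end', function () { parsePage(buffer.join('')); });
});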
Parsing
Use cheerio to parse the HTML; after that, everything works like jQuery.
var cheerio = require('cheerio');
request(url, function (err, res, body) {
  var $ = cheerio.load(body);
  //...
});
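For example, to grab the page title and every link target (the selectors here are just placeholders for whatever you are scraping):

request(url, function (err, res, body) {
  var $ = cheerio.load(body);
  console.log($('title').text());          // page title
  $('a').each(function () {
    console.log($(this).attr('href'));     // each link's href
  });
});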
Each
$('table tr').each(function (i, row) {
  console.log($(this).html());
});
To get the list of fields and output the first column:
$('table tr').each(function (i, row) {
  var fields = $(this).find('td');
  console.log($(fields[0]).text());
});
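Cheerio also supports jQuery-style .map(); for instance, to collect the first column into a plain array (assuming a simple table of td cells):

var firstColumn = $('table tr').map(function (i, row) {
  return $(row).find('td').first().text();
}).get();

console.log(firstColumn);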