logo

Node.js - Web Crawling

Crawling

Use the request module or Node's built-in http module to fetch the raw HTML.

Request

var request = require('request');

var url = 'http://foo.com';

// Plain text: the callback receives the response body once the request
// has completed.
request(url, function (err, res, body) {
  if (err) {
    // The request itself failed, so there is no response to work with.
    console.error(err);
    return;
  }
  // ... use `body` here
});

// gzip: `gzip: true` asks request to accept compressed responses and to
// decode them before the callback runs, so `body` is used the same way.
request({url: url, gzip: true}, function (err, res, body) {
  if (err) {
    console.error(err);
    return;
  }
  // ... use `body` here
});

http

var http = require('http');
var qs = require('querystring');

// Build the query string with qs.stringify so the search term is
// URL-encoded. `search` must be defined by the surrounding code.
http.request({
    host: 'search.twitter.com',
    path: '/search.json?' + qs.stringify({ q: search })
}, function (res) {})
    // Without an 'error' listener a failed request throws as an uncaught
    // exception.
    .on('error', function (err) {
        console.error(err);
    })
    // http.request() only prepares the request; nothing is sent until
    // end() is called.
    .end();

If plain text

// Download `url` and pass the complete response body to parsePage() as one
// string.
http.get(url, function (res) {
  var chunks = [];
  res
    .on('data', function (chunk) {
      chunks.push(chunk);
    })
    .on('end', function () {
      // Concatenate the raw chunks first and decode once. Decoding each
      // chunk on its own (e.g. chunks.join('')) corrupts any multi-byte
      // character that happens to be split across two chunks.
      parsePage(Buffer.concat(chunks).toString());
    });
}).on('error', function (err) {
  // Connection-level failures are reported on the request object.
  console.error(err);
});

If gzipped

var zlib = require('zlib');

// Download `url`, decompress the gzip stream, and pass the complete body to
// parsePage() as one string.
http.get(url, function (res) {
  var chunks = [];
  var gunzip = zlib.createGunzip();
  res.pipe(gunzip);

  gunzip
    .on('data', function (chunk) {
      chunks.push(chunk);
    })
    .on('end', function () {
      // Concatenate the raw chunks first and decode once. Decoding each
      // chunk on its own (e.g. chunks.join('')) corrupts any multi-byte
      // character that happens to be split across two chunks.
      parsePage(Buffer.concat(chunks).toString());
    })
    .on('error', function (err) {
      // Emitted when the response is not valid gzip data.
      console.error(err);
    });
}).on('error', function (err) {
  // Connection-level failures are reported on the request object.
  console.error(err);
});

Parsing

Use cheerio to parse the HTML; after that, everything works like jQuery.

var cheerio = require('cheerio');

// Declared explicitly so the assignment below does not create an implicit
// global (which throws in strict mode). It stays undefined until the page
// has been fetched and parsed.
var $;

request(url, function (err, res, body) {
  if (err) {
    // No body to parse when the request itself failed.
    console.error(err);
    return;
  }
  $ = cheerio.load(body);
  //...
});

Each

// Print the inner HTML of every table row.
var rows = $('table tr');
rows.each(function (index, row) {
  // `row` is the same element cheerio binds to `this` for this callback.
  var rowHtml = $(row).html();
  console.log(rowHtml);
});

To get the list of fields in each row and output the first column:

// For every table row, collect its <td> cells and print the text of the
// first one.
var tableRows = $('table tr');
tableRows.each(function (index, row) {
  var cells = $(row).find('td');
  var firstCell = cells.first();
  console.log(firstCell.text());
});