akiyoshi83
5/15/2014 - 3:10 PM

CasperJS scraping sample

CasperJS scraping sample

# Launch the CasperJS scraper under the ja_JP.SJIS locale.
# The first positional argument is forwarded to scrape.js as the URL.
#
# The locale variables are exported so the casperjs child process inherits
# them; a plain assignment only reaches the child if the name was already
# exported by the calling environment.
export LANG="ja_JP.SJIS"
export LC_ALL="ja_JP.SJIS"

# Quote the URL: unquoted, characters such as ? and * would be treated as
# glob patterns and any whitespace would split it into several arguments.
casperjs scrape.js "$1"
// NOTE: this script is executed by PhantomJS (through CasperJS), so it is
// deliberately kept to ES5 syntax: var, function expressions and string
// concatenation instead of const/let, arrow functions and template literals.

// Single usage message shared by every argument-error path.
var USAGE = 'USAGE: casperjs scrape.js URL';

/**
 * Split an http(s) URL into the pieces used to name the output files.
 *
 * @param {string} url - Absolute URL given on the command line.
 * @returns {?{domain: string, path: string, fname: string}} `domain` is the
 *   host part, `path` is everything after it (path, query and fragment) and
 *   `fname` is the last path segment, or 'index.html' when the URL has no
 *   usable last segment. Returns null when `url` is not an http(s) URL.
 */
function parseTarget(url) {
  // Anchored so that text merely containing a URL is rejected; the host
  // stops at the first '/', '?' or '#'.
  var m = String(url).match(/^https?:\/\/([^\/?#]+)(.*)$/);
  if (!m) {
    return null;
  }

  var rest = m[2];
  // The file name comes from the path only, so drop the query and fragment.
  var pathname = rest.split(/[?#]/)[0];
  var lastSegment = pathname.substring(pathname.lastIndexOf('/') + 1);
  // '.' and '..' name directories, not files, so fall back to the default.
  if (lastSegment === '' || lastSegment === '.' || lastSegment === '..') {
    lastSegment = 'index.html';
  }

  return { domain: m[1], path: rest, fname: lastSegment };
}

/**
 * Entry point: validate the command line, then capture a screenshot and the
 * HTML of the requested page into the current working directory.
 */
function main() {
  var system = require('system');
  var fs = require('fs');

  // system.args is the PhantomJS view of the command line. The first three
  // entries are skipped, leaving the script name followed by its arguments,
  // so the URL is expected at index 1.
  var scriptArgs = system.args.slice(3);
  var url = scriptArgs.length < 2 ? '' : scriptArgs[1];
  var target = parseTarget(url);

  if (!target) {
    console.log(USAGE);
    // phantom.exit() does not stop the running script, so return explicitly
    // instead of falling through with an unusable URL. A non-zero status
    // lets callers detect the usage error.
    phantom.exit(1);
    return;
  }

  console.debug('domain: ' + target.domain);
  console.debug('path: ' + target.path);
  console.debug('fname: ' + target.fname);

  // scraping by casperjs
  // --------------------
  var casper = require('casper').create();

  casper.start(url, function() {
    this.evaluate(function() {
      // Force a white page background so the capture does not come out
      // with a transparent background.
      document.body.bgColor = 'white';
    });
    // TODO make directory hierarchy
    this.capture(target.fname + '.png');
    fs.write(target.fname, this.getHTML(), 'w');
    // TODO download resources
    this.exit();
  });

  casper.run();
}

// Only start scraping when the PhantomJS global is present; this keeps
// parseTarget loadable on its own outside PhantomJS.
if (typeof phantom !== 'undefined') {
  main();
}