andy0130tw
12/3/2015 - 1:13 PM

LM Image batch downloader

LM Image batch downloader

var fs = require('fs');
var process = require('process');
var zlib = require('zlib');

var request = require('request');
var async = require('async');
var fileType = require('file-type');
var sqlite = require('sqlite3');

/**
 * Builds the download URL for an image id.
 *
 * @param {string} id - Image id, inserted verbatim into the URL path.
 * @returns {string} URL of the image on the message server with the
 *     `size=X` query parameter appended.
 */
function imgurl(id) {
    var endpoint = 'https://message.learnmode.net:5443/image/';
    var sizeQuery = '?size=X';
    return endpoint + id + sizeQuery;
}

/**
 * Abbreviates a hash for log output as "<first 6 chars>~<last 4 chars>".
 *
 * @param {string} hash - Hash to abbreviate; any falsy value yields ''.
 * @returns {string} The abbreviated form, or '' when no hash is given.
 */
function formatHash(hash) {
    if (!hash) return '';
    // slice() clamps out-of-range indexes. The former substr(length - 4, 4)
    // received a negative start for inputs shorter than four characters,
    // which wrapped around and produced a wrong tail for a three-character
    // input ('abc' became 'abc~c').
    return hash.slice(0, 6) + '~' + hash.slice(-4);
}

// Standard output stream of this process.
var stdout = process.stdout;
// Image ids to download, one per line of imglist.txt. Each line is trimmed
// (which also strips a trailing '\r' from CRLF files) and empty lines are
// dropped, so a trailing newline or blank line no longer yields an empty id
// that would be requested from the server.
var imglist = fs.readFileSync('imglist.txt', {encoding: 'utf-8'})
    .split('\n')
    .map(function(line) { return line.trim(); })
    .filter(function(line) { return line.length > 0; });
// Existing database opened read/write; OPEN_CREATE is not passed, so the
// file is expected to exist already.
var db = new sqlite.Database('imgdb.sqlite', sqlite.OPEN_READWRITE);
// Prepared insert reused for every downloaded image.
var stmt_int = db.prepare('INSERT INTO image (hash, size, content, date, format) VALUES (?,?,?,?,?)');

// Number of list entries processed so far (stored or skipped).
var cnt = 0;

/*
 * Main pipeline:
 *   1. Look up a previously stored hash to find where to resume.
 *   2. Translate that hash into an index in imglist.
 *   3. Download every remaining image (at most 20 at a time), gzip it and
 *      insert it into the database.
 * Any error is passed to the final callback, which logs it and exits with
 * a non-zero status; on success the statement and database are closed.
 */
async.waterfall([
function(async_next) {
    // NOTE(review): this picks the greatest hash in sort order, which is the
    // most recently downloaded one only if imglist.txt is sorted ascending
    // -- confirm against how the list is generated.
    db.get('SELECT hash FROM image ORDER BY hash desc', function(err, row) {
        if (err) return async_next(err);
        if (!row) {
            console.log('No previous data available. Fetching from start...');
            return async_next(null, undefined);
        }
        async_next(null, row.hash);
    });
}, function(prevHash, async_next) {
    var idx = prevHash ? imglist.indexOf(prevHash) : 0;
    if (idx < 0) {
        // Without this guard slice(-1) below would process only the last
        // list entry when the stored hash is missing from the list.
        console.log('Previous hash ' + formatHash(prevHash) + ' not found in list. Fetching from start...');
        idx = 0;
    } else if (prevHash) {
        console.log('Fetching from #' + idx + ': ' + formatHash(prevHash) + '...');
    }
    async_next(null, idx);
}, function(idx, async_next) {
    // maybe slow here...
    imglist = imglist.slice(idx);
    // Number of downloads currently in flight (for the progress log only).
    var work_counter = 0;
    async.eachLimit(imglist, 20, function(v, callback) {
        work_counter++;
        async.waterfall([
        function(async_next) {
            // encoding: null makes `request` deliver the body as a Buffer.
            request.get(imgurl(v), {encoding: null}, function(err, resp, body) {
                if (err) return async_next(err);
                async_next(null, resp, body);
            });
        }, function(resp, body, async_next) {
            zlib.gzip(body, function(err, buffer) {
                if (err) return async_next(err);
                async_next(null, resp, body, buffer);
            });
        }, function(resp, body, body_zipped, async_next) {
            // fileType() yields a falsy value when it cannot identify the
            // data; report that instead of crashing on `.mime`.
            var type = fileType(body);
            if (!type) {
                return async_next(new Error(
                    'Unrecognized file type for ' + v +
                    ' (HTTP ' + resp.statusCode + ')'));
            }
            var mime = type.mime;
            stmt_int.run([
                v,
                body.length,
                body_zipped,
                // toJSON() gives null when the header is absent or unparsable.
                new Date(resp.headers['last-modified']).toJSON(),
                mime
            ], function(err) {
                if (err) {
                    // A constraint violation is treated as "already stored":
                    // signal a skip by passing no body. Returning here
                    // prevents the callback from being invoked twice.
                    if (err.code === 'SQLITE_CONSTRAINT')
                        return async_next(null, null);
                    return async_next(err);
                }
                async_next(null, body, body_zipped, mime);
            });
        }], function(err, body, body_zipped, mime) {
            work_counter--;
            cnt++;
            if (err) return callback(err);
            if (!body) {
                // skipped
                console.log(
                    formatHash(v), '=== SKIPPED ==='
                );
                // Release the worker slot; otherwise eachLimit stalls once
                // 20 entries have been skipped.
                return callback(null);
            }
            console.log(
                formatHash(v),
                '[' + mime + ']',
                body.length,
                body_zipped.length,
                'cnt=' + cnt,
                'workers=' + work_counter
            );
            callback(null);
        });
    }, async_next);
}], function(err) {
    if (err) {
        // Mirror the former behaviour of aborting the whole run on the
        // first failure, but with the error reported through one path.
        console.error(err);
        process.exit(1);
    }
    // All entries processed: release the prepared statement, then the
    // database handle.
    stmt_int.finalize(function() {
        db.close();
    });
});