crazy4groovy
11/22/2015 - 6:12 PM

Scrape Google images search results

Scrape Google images search results

var fs = require('fs');
var path = require('path');

var co = require('co');
var readdir = require('recursive-readdir');

/*
    Simple script that strips trailing "garbage" from the filenames of detectable images,
    e.g. Google Images files downloaded via `wget --no-check-certificate -nc -T 4 -t 4 -i images.txt`.
*/

var dir = process.argv[2];

if (!dir) {
    console.log('Please specify a directory to process - Aborting.');
    return;
}

// Matches a path that already ends in a clean image extension (nothing to fix).
var img1 = /^.*\.(jpe?g|png|gif)$/i;
// Captures the first image extension (group 1) and whatever trails it (group 2).
var img2 = /\.(jpe?g|png|gif)(.*)/i;

/**
 * Renames a file so that its name ends in its image extension, dropping any
 * characters that trail the extension. For example `dir/pic.jpg_extra`
 * becomes `dir/pic._.jpg`.
 *
 * Only the last path segment is inspected, so directory names that happen to
 * contain an extension or the trailing text never influence the result. An
 * existing file is never overwritten: on a name clash a counter is inserted
 * (`pic._1.jpg`, `pic._2.jpg`, ...).
 *
 * @param {string} fileName - Path of the file to rename.
 * @returns {string|undefined} The new path, or undefined when the name has no
 *     image extension or nothing trails it (the file is left untouched).
 */
function renameFile(fileName) {
    const dirName = path.dirname(fileName);
    const baseName = path.basename(fileName);

    const matched = img2.exec(baseName);
    // No image extension, or nothing after it: there is nothing to strip.
    if (!matched || matched[2] === '') return undefined;

    const type = matched[1];
    // Use the match position rather than searching for the extension text,
    // which may also occur earlier in the name (e.g. "jpgallery.jpg_v2").
    const stem = baseName.slice(0, matched.index);

    // The "_" marker keeps the cleaned name distinct from a sibling that was
    // downloaded without cruft; the counter handles clashes between cleaned names.
    let goodName = path.join(dirName, stem + '._.' + type);
    for (let n = 1; fs.existsSync(goodName); n++) {
        goodName = path.join(dirName, stem + '._' + n + '.' + type);
    }

    fs.renameSync(fileName, goodName);
    console.log(fileName, ' --> ', goodName);
    return goodName;
}

// Entry point: list the files under `dir` and clean up every name that does
// not already end in a recognised image extension.
co(function* init() {
    var fileNames = yield readdir(dir);
    fileNames.forEach(f => {
        if (!img1.test(f)) renameFile(f);
    });
}).catch(function (err) {
    // co() returns a promise. Without a handler, a failed directory listing or
    // rename surfaces only as an unhandled rejection, so report it explicitly
    // and signal failure through the exit code.
    console.error('Failed to process ' + dir + ':', err);
    process.exitCode = 1;
});
/*
    Browser snippet for a Google Images results page (run from the console or
    as a bookmarklet). It clicks every `a.rg_l` element one after another,
    then reads the `imgurl` query parameter from the links found under
    `div.irc_bg div.irc_rimask` and lists each distinct value in a floating
    panel. Double-click the panel to remove it.

    NOTE(review): the selectors presumably target Google's thumbnail links and
    the preview pane as they were marked up when this was written - confirm
    against the current page before relying on them.
*/
!function(document) {
  var DEFAULT_DELAY_MS = 250;

  // Floating, scrollable panel that receives the collected URLs.
  var panel = document.createElement('div');
  panel.style.cssText = 'position: fixed; top: 1em; left: 1em; z-index: 1000; background-color: rgba(188,188,88,.5); padding: 1em; border-radius: 1em; max-height: 50%; max-width: 50%; overflow: scroll;';
  panel.ondblclick = function() { panel.remove(); };
  document.body.appendChild(panel);

  // URLs already listed, so each image is shown once.
  var seen = new Set();

  // Calls cb(element, index) for every element matching the selector.
  function eachEl(selector, cb) {
    [].forEach.call(document.querySelectorAll(selector), cb);
  }

  // Asks for the pause between clicks. A cancelled prompt, blank input,
  // non-numeric input or a negative number falls back to the default.
  function readDelay() {
    var answer = prompt('delay in ms', String(DEFAULT_DELAY_MS));
    var ms = answer === null || answer.trim() === '' ? NaN : Number(answer);
    return Number.isFinite(ms) && ms >= 0 ? ms : DEFAULT_DELAY_MS;
  }

  var delay = readDelay();

  // Stagger the clicks so each one has time to take effect before the next.
  eachEl('a.rg_l', function(a, index) {
    setTimeout(function() { eachBigImg(a); }, index * delay);
  });

  // Clicks one element, then harvests the links once the page has had a
  // moment (two thirds of the click interval) to update.
  function eachBigImg(a) {
    a.click();
    setTimeout(function() {
      eachEl('div.irc_bg div.irc_rimask a', eachImg);
    }, delay / 1.5 | 0);
  }

  // Extracts the imgurl parameter from one link and lists it if it is new.
  function eachImg(a) {
    // Extract first, decode second: decoding the whole href up front would
    // turn an encoded "&" inside the image URL into a separator and truncate it.
    var match = a.href.match(/imgurl=([^&]+)/);
    if (!match) return;

    var url;
    try {
      url = decodeURIComponent(match[1]);
    } catch (e) {
      // Malformed percent-escape: keep the raw value rather than abort.
      url = match[1];
    }

    if (seen.has(url)) return;
    seen.add(url);

    // Insert as text, never as HTML: the URL comes from the page and is untrusted.
    panel.appendChild(document.createTextNode(url));
    panel.appendChild(document.createElement('br'));
  }
}(document);