archwhite
8/28/2016 - 2:38 PM

js web crawler

js web crawler

// Shared crawler state. These are read and written by loadNext() and by the
// bootstrap code at the bottom of the script.

// Set to true by loadNext() to make the driver loop stop.
var stopflag = false;

// Value appended to the search URL's `offset=` parameter; advanced by the
// number of profiles matched on each page.
var offset = 0;

// <section class="weber"> element that receives each downloaded page.
var html;

// <section class="firstName"> element created during bootstrap.
var data;

/**
 * Downloads the next page of search results and advances the crawl.
 *
 * Builds the search URL from the shared `offset`, fetches it with `xhr`,
 * stores the markup in the shared `html` element and lets `selectNames`
 * count the profiles on that page. `offset` is then advanced by that count.
 *
 * Sets the shared `stopflag` (and leaves `offset` untouched) when either
 *   - the request produced no body (`xhr` returned undefined), or
 *   - the page contained no profiles, so no further progress is possible.
 */
function loadNext() {
    console.log('offset = ', offset);
    const url = "http://website/ru/search.phtml?ia=M&lf=F&af=45&at=47&t=z&s_c=3159_4312_0_0&form=1&offset=" + offset;

    // The response must be checked BEFORE it is written to innerHTML:
    // assigning undefined stores the text "undefined", and the innerHTML
    // getter always returns a string, so a check made afterwards never fires.
    const response = xhr(url);
    if (response == null) {
        console.info("This is the end");
        stopflag = true;
        return;
    }

    html.innerHTML = response;
    const found = selectNames(html);

    // A page without matches would leave the offset unchanged and make the
    // driver loop request the very same URL forever.
    if (found === 0) {
        console.info("This is the end");
        stopflag = true;
        return;
    }

    offset += found;
}

/**
 * Performs a blocking (synchronous) GET request.
 *
 * @param {string} url - Address to fetch.
 * @returns {string|undefined} The response body when the server answers with
 *     status 200; undefined when the request fails or any other status is
 *     returned (the failure is logged with console.info).
 */
function xhr(url) {
    // Named `request` so the local does not shadow this function's own name.
    const request = new XMLHttpRequest();

    try {
        // Third argument `false` makes the request synchronous.
        request.open("GET", url, false);
        request.setRequestHeader("X-Requested-With", "XMLHttpRequest");
        // A synchronous send() throws on network-level failures instead of
        // reporting them through `status`.
        request.send();
    } catch (err) {
        console.info('request failed: ' + err + ', url = ', url);
        return undefined;
    }

    if (request.status !== 200) {
        // Example output: "404: Not Found, url =  <url>"
        console.info(request.status + ': ' + request.statusText + ', url = ', url);
        return undefined;
    }

    return request.responseText;
}

/**
 * Returns the index in `str` at which the `i`-th occurrence of `m` starts.
 *
 * The string is cut at `m` into at most `i` pieces; gluing those pieces back
 * together with `m` rebuilds the text that precedes the `i`-th separator, and
 * the length of that text is the result. When `str` holds fewer than `i`
 * occurrences the whole string is rebuilt, so `str.length` is returned.
 *
 * @param {string} str - Text to search.
 * @param {string} m - Separator to look for.
 * @param {number} i - Which occurrence to locate (1-based).
 * @returns {number} Length of the text before the `i`-th occurrence.
 */
function getPosition(str, m, i) {
    const pieces = str.split(m, i);
    const leadingText = pieces.join(m);
    return leadingText.length;
}

/**
 * Scans the markup held by `html` for embedded profile records and logs the
 * ones whose name starts with "та" (Cyrillic) or "ta" (Latin), compared
 * case-insensitively.
 *
 * For every logged profile the name and a profile link built from the login
 * are written with console.log.
 *
 * @param {{innerHTML: string}} html - Element (or element-like object) whose
 *     `innerHTML` contains the downloaded page.
 * @returns {number} Number of profile records found, whether logged or not.
 */
function selectNames(html) {
    // `.` does not cross line breaks, so one match never spans several lines.
    const regex = /data: {"profile":.*"name":"(.*)","age".*"login":"(.*)","state/g;

    // Read the markup once; every innerHTML access re-serialises the subtree.
    const markup = html.innerHTML;

    let count = 0;
    // Declared locally so the function neither leaks a global nor throws in
    // strict-mode / module code.
    let match;
    // The /g flag makes exec() resume after the previous match.
    while ((match = regex.exec(markup)) !== null) {
        const lowerName = match[1].toLowerCase();
        if (lowerName.startsWith('та') || lowerName.startsWith('ta')) {
            console.log(match[1], 'http://www.mamba.ru/' + match[2]);
        }
        ++count;
    }
    return count;
}

// Bootstrap: make sure the two helper <section> elements exist, then crawl.
// `weber` is declared explicitly so it is no longer an implicit global.
var weber = document.querySelector('section.weber');
if (weber == null) {
    // First run on this page: create the container that receives each
    // downloaded result page...
    html = document.createElement('section');
    html.setAttribute('class', 'weber');
    document.body.appendChild(html);

    // ...and the companion 'firstName' section.
    data = document.createElement('section');
    data.setAttribute('class', 'firstName');
    document.body.appendChild(data);
} else {
    // The sections already exist (e.g. the script was run before on this
    // page). Reuse them; otherwise `html` would stay undefined and
    // loadNext() would fail on `html.innerHTML`.
    html = weber;
    data = document.querySelector('section.firstName');
}

// Fetch page after page until loadNext() raises the stop flag.
while (stopflag === false) {
    loadNext();
}