'use strict';
/*
from a list of company numbers
retrieve all the company officers' details, and save these in a tsv file
if you fail to get a company then save its number in another file
usage:
node index.js infile.txt outfile.tsv
*/
const fs = require('fs'); //to load and save files
const fetch = require("fetch").fetchUrl; //to fetch URLs
const cheerio = require('cheerio'); //to parse the html
// Load the company numbers to look up: one per line, from the file named by
// the first CLI argument (argv[0] is 'node', argv[1] is this script).
// Lines are trimmed and blank ones dropped so that a trailing newline or
// Windows (CRLF) line endings do not produce requests for bogus companies.
const numbers = fs
  .readFileSync(process.argv[2], 'utf-8')
  .split(/\r?\n/)
  .map((line) => line.trim())
  .filter((line) => line !== '');

// Output TSV path: the optional second CLI argument, else 'result.tsv'.
// parseFile appends one row per officer to this file.
const outfile = process.argv[3] || 'result.tsv';

console.log(`getting ${numbers.length} companies`);

// Kick off one request per company. The requests are all started straight
// away; parseFile runs as each response arrives.
for (const number of numbers) {
  const url = `https://beta.companieshouse.gov.uk/company/${encodeURIComponent(number)}/officers`;
  console.log(`getting ${url}`);
  fetch(url, parseFile);
}
/**
 * Callback for `fetch`: parses one officers page, appends a TSV row per
 * officer to `outfile`, and follows the 'Next' pagination link if present.
 *
 * A failed or unusable response is logged and skipped rather than thrown,
 * so one bad company does not abort the requests still in flight.
 *
 * @param {Error|null} err - error reported by fetch, if any
 * @param {object} meta - response metadata; `finalUrl` and `status` are read
 * @param {Buffer|string} body - the response body (HTML)
 * @returns {void}
 */
function parseFile(err, meta, body) {
  if (err || !meta || body == null) {
    const reason = err ? (err.message || String(err)) : 'empty response';
    console.error(`\tfetch failed: ${reason}`);
    return;
  }

  console.log('\tparsing ' + meta.finalUrl);

  if (meta.status >= 400) {
    console.error(`\tskipping ${meta.finalUrl}: HTTP status ${meta.status}`);
    return;
  }

  // Take the whole path segment after /company/ rather than the first run of
  // digits, so numbers with a letter prefix (e.g. 'SC123456') stay intact.
  const match = /\/company\/([^/?#]+)/.exec(meta.finalUrl);
  if (!match) {
    console.error(`\tskipping ${meta.finalUrl}: no company number in URL`);
    return;
  }
  const companyNumber = match[1];

  const $ = cheerio.load(body.toString());

  // Text of one officer field, with internal whitespace collapsed so that a
  // stray tab or newline in the page cannot break the TSV row.
  const fieldText = (field, index) =>
    $(`#officer-${field}-${index}`).text().replace(/\s+/g, ' ').trim();

  const officerFields = [
    'name',
    'role',
    'address-value',
    'date-of-birth',
    'appointed-on',
    'nationality',
    'country-of-residence',
    'occupation',
  ];

  // Officers are numbered from 1 on each page; stop at the first gap.
  for (let i = 1; $(`.appointment-${i}`).length > 0; i++) {
    const officer = [
      companyNumber,
      ...officerFields.map((field) => fieldText(field, i)),
    ];
    fs.appendFileSync(outfile, officer.join('\t') + '\n');
  }

  // Follow pagination: the last '.page' element is the 'Next' link when
  // there are further pages.
  const lastPageLink = $('.page').last();
  const href = lastPageLink.attr('href');
  if (href && lastPageLink.text().trim() === 'Next') {
    // Resolve against the URL actually served, so redirects are respected.
    const nextUrl = new URL(href, meta.finalUrl).href;
    console.log('getting ' + nextUrl);
    fetch(nextUrl, parseFile);
  }
}