tomgp
5/31/2016 - 1:18 PM

Companies House officers lists

{
  "name": "companies-house-officers",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "cheerio": "^0.20.0",
    "fetch": "^1.0.1"
  }
}

'use strict';
/*
from a list of company numbers (one per line)
retrieve all the company officers' details, and save these in a tsv file
if you fail to get a company then save its number in another file

usage:
    node index.js infile.txt [outfile.tsv]
*/

const fs = require('fs'); //to load and save files
const fetch = require("fetch").fetchUrl; //to fetch URLs
const cheerio = require('cheerio'); //to parse the html

//argv[0] is 'node', argv[1] is this script, argv[2] should name the file listing company numbers
const infile = process.argv[2];
if(!infile){
    //without this guard readFileSync(undefined) throws an opaque TypeError
    console.error('usage: node index.js infile.txt [outfile.tsv]');
    process.exit(1);
}

//one company number per line
const numbers = fs.readFileSync(infile, 'utf-8')
            .split(/\r?\n/) //tolerate Windows line endings as well as '\n'
            .map(line => line.trim())
            .filter(line => line.length > 0); //a trailing newline or blank line must not become a request for '/company//officers'

//officers are appended to the file named by the optional second argument, defaulting to result.tsv
const outfile = process.argv[3] || 'result.tsv';

console.log(`getting ${numbers.length} companies`);

for( const number of numbers ){ //for each company number (for...of iterates values; for...in would iterate string keys)
    const url = `https://beta.companieshouse.gov.uk/company/${encodeURIComponent(number)}/officers`; //build the URL
    console.log('getting ' + url);
    fetch( url , parseFile ); //fetch the URL and when it's received, parse it
}

/**
 * Callback for `fetch`: scrape one officers page and append its officers to `outfile`.
 *
 * Each officer becomes one tab-separated row: company number, name, role, address,
 * date of birth, appointed-on date, nationality, country of residence, occupation.
 * If the page's last `.page` link reads 'Next', that page is fetched and handled
 * by this same function.
 *
 * @param {*} err - error reported by the fetch, if any
 * @param {Object} meta - response metadata; `finalUrl` and `status` are read
 * @param {Buffer|string} body - the response body (HTML)
 * @returns {undefined}
 */
function parseFile(err, meta, body){
    //a failed request leaves meta/body unset; report it instead of crashing the whole run
    if(err || !meta || body == null){
        const failedUrl = (meta && meta.finalUrl) ? meta.finalUrl : 'request';
        const reason = err ? (err.message || err) : 'empty response';
        console.error(`\tfailed ${failedUrl}: ${reason}`);
        return;
    }
    //an HTTP error page holds no officers, so don't bother parsing it
    if(meta.status >= 400){
        console.error(`\tfailed ${meta.finalUrl}: HTTP ${meta.status}`);
        return;
    }

    console.log( '\tparsing '+ meta.finalUrl );
    const $ = cheerio.load(body.toString());

    //take the whole path segment after /company/ rather than the first run of digits,
    //so that numbers with a letter prefix (e.g. SC123456) are kept intact
    const match = /\/company\/([^\/?#]+)/.exec( meta.finalUrl );
    if(!match){
        console.error(`\tno company number in ${meta.finalUrl}`);
        return;
    }
    const companyNumber = match[1];

    //element id prefixes, in output column order; the officer's index on the page is appended
    const fieldIds = [
        'officer-name-',
        'officer-role-',
        'officer-address-value-',
        'officer-date-of-birth-',
        'officer-appointed-on-',
        'officer-nationality-',
        'officer-country-of-residence-',
        'officer-occupation-'
    ];
    //collapse inner whitespace: a tab or newline inside a value would corrupt the tsv row
    const clean = text => text.replace(/\s+/g, ' ').trim();

    //officers are marked up as .appointment-1, .appointment-2, ...; stop at the first missing index
    for(let i = 1; $('.appointment-' + i).length > 0; i++){
        const officer = [companyNumber].concat(
            fieldIds.map(id => clean($('#' + id + i).text()))
        );
        //write the officer to the file
        fs.appendFileSync(outfile, officer.join('\t') + '\n');
    }

    //check for a 'next' link
    const lastpagelink = $('.page').last();
    const href = lastpagelink.attr('href');
    if(href && lastpagelink.text().trim() === 'Next'){
        const nextUrl = 'https://beta.companieshouse.gov.uk' + href;
        console.log('getting ' + nextUrl);
        fetch( nextUrl , parseFile );
    }
}