ericjarvies
2/16/2017 - 2:33 PM

Split & Merge tool for JSON, NDJSON, and ESJSON

Split & Merge tool for JSON, NDJSON, and ESJSON

Create Folder;

mkdir /Users/propertydb/.npm-packages/lib/node_modules/split-merge
cd ~/.npm-packages/lib/node_modules/split-merge

Create package.json;

sudo nano package.json
{
  "name": "split-merge",
  "version": "1.0.0",
  "description": "Split & Merge JSON, NDJSON & ESJSON",
  "main": "split-merge.js",
  "dependencies": {
    "command-line-args": "^4.0.1",
    "command-line-usage": "^4.0.0"
  },
  "devDependencies": {},
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "collective",
  "license": "ISC",
  "preferGlobal": true,
  "bin": {
    "split-merge": "split-merge.js"
  }
}

Create split-merge.js;

#! /usr/bin/env node
var fs = require("fs");
var path = require("path");
const clu = require('command-line-usage');
const cla = require('command-line-args');

// Option tables. `name`, `alias`, `type` and `defaultValue` are passed to the
// command-line parser; `arg` and `desc` are read by getSectionOption() to
// build the help text.

// Options that apply to the split action (-s).
const splitOptions = [
  {
    name: 'split',
    alias: 's',
    type: String,
    arg: 'file',
    desc: 'a json-line file to split',
  },
  {
    name: 'name-key',
    alias: 'n',
    type: String,
    arg: 'key',
    desc: 'key for the name of file, will groups objects with the same file',
  },
  {
    name: 'path-key',
    alias: 'p',
    type: String,
    arg: 'key',
    desc: 'key for the output path',
  },
  {
    name: 'omit-name',
    alias: 't',
    type: Boolean,
    desc: 'omit the name key',
  },
  {
    name: 'omit-path',
    alias: 'u',
    type: Boolean,
    desc: 'omit the path key',
  },
  {
    name: 'out-key',
    alias: 'k',
    type: String,
    arg: 'key',
    desc: 'output the groups as array value of this key',
  },
];

// Options that apply to the merge action (-m).
const mergeOptions = [
  {
    name: 'merge',
    alias: 'm',
    type: String,
    arg: 'dir',
    desc: 'dir with json files to merge',
  },
  {
    name: 'merge-output',
    alias: 'o',
    type: String,
    arg: 'file',
    desc: 'merge output file',
  },
  {
    name: 'index',
    alias: 'x',
    type: String,
    arg: 'ESJSON index key',
    desc: 'specify index key for ESJSON',
  },
];

// Options shared by both actions.
const generalOptions = [
  {
    name: 'output-dir',
    alias: 'd',
    type: String,
    defaultValue: '.',
    arg: 'dir',
    desc: 'root output dir, defaults to current dir',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    desc: 'show this help',
  },
];

// Full definition list, in split / merge / general order.
const optionDefs = [...splitOptions, ...mergeOptions, ...generalOptions];

// Usage sections handed to command-line-usage: a description, the two
// invocation forms, then one option table per option group.
const help = [
  {
    header: 'split-merge',
    content: [
      'Split a json-line file or merge json files.',
      '',
      'A json-line file is a file containing valid json on each line.',
    ],
  },
  {
    header: 'Usage',
    content: [
      'node split-merge.js -s FILE [options]',
      'node split-merge.js -m DIR [options]',
      '',
      'The first form to split FILE.',
      'The second form to merge files in DIR.',
    ],
  },
].concat(
  [
    ['Split options', splitOptions],
    ['Merge options', mergeOptions],
    ['General options', generalOptions],
  ].map((entry) => getSectionOption(entry[0], entry[1]))
);

/**
 * Build one help section listing a group of options.
 *
 * Each option becomes a row object with three fields: `a` (short flag),
 * `b` (long flag followed by a space and the argument placeholder, if any)
 * and `c` (description).
 *
 * @param {string} title - section header text
 * @param {Array<Object>} optionDef - option definitions with `alias`, `name`,
 *   `desc` and optionally `arg`
 * @returns {{header: string, content: Array<{a: string, b: string, c: string}>}}
 */
function getSectionOption(title, optionDef) {
  const rows = [];
  for (const option of optionDef) {
    const placeholder = option.arg || '';
    rows.push({
      a: `-${option.alias}`,
      b: `--${option.name} ${placeholder}`,
      c: option.desc,
    });
  }
  return { header: title, content: rows };
}

// parse options
const opts = cla(optionDefs);

// Show help before validating anything else, so that `-h` on its own prints
// the usage and exits successfully instead of being reported as a missing
// action.
if (opts.help) {
  console.log(clu(help));
  process.exit(0);
}

// handle errors
if (!opts.split && !opts.merge) {
  exitErr('Please specify an action: -s (split) or -m (merge).');
}

/**
 * Print the usage text followed by an "Error" section, then terminate the
 * process with exit code -1. Never returns.
 *
 * @param {string} str - error message shown in the "Error" section
 */
function exitErr(str) {
  const errorSection = {
    header: 'Error',
    content: str,
  };
  // Build a new section list rather than pushing onto the shared `help` array.
  console.log(clu(help.concat([errorSection])));
  process.exit(-1);
}

// Matches one record of the bracket-wrapped format "[<json>]" with an optional
// trailing comma (the shape written by merge format 1). Group 1 is the inner
// JSON text. Anchored on both ends so that a plain NDJSON line that merely
// contains an array is not mistaken for a wrapped record.
const WRAPPED_RECORD = /^\s*\[([\s\S]*)\],?\s*$/;

/**
 * Parse one non-blank input line into a record.
 * Prints a message naming the line and exits with code -1 on invalid JSON.
 *
 * @param {string} line - raw line text
 * @param {boolean} wrapped - true when the input uses the "[<json>]," format
 * @param {number} lineNumber - 1-based line number, used in the error message
 * @returns {*} the parsed JSON value
 */
function parseRecord(line, wrapped, lineNumber) {
  const match = wrapped ? WRAPPED_RECORD.exec(line) : null;
  // A line that is not wrapped is parsed as-is.
  const text = match ? match[1] : line;
  try {
    return JSON.parse(text);
  }
  catch (err) {
    console.error('Invalid JSON on line ' + lineNumber + ': ' + err.message);
    process.exit(-1);
  }
}

/**
 * Split a json-line file into one output file per record name.
 *
 * For every record, the value under --path-key selects the output folder
 * (below --output-dir) and the value under --name-key the file name
 * ("<name>.json"). Without --out-key each record is appended as one line;
 * with --out-key the file holds a single object whose <out-key> property is
 * an array of the records.
 *
 * @param {string} inputFile - path of the file to split
 */
function splitFile(inputFile) {
  const nameKey = opts['name-key'];
  const pathKey = opts['path-key'];
  const outKey = opts['out-key'];

  // Validate once, before reading anything.
  if (!nameKey) {
    exitErr("Please specify the name-key.");
  }
  if (!pathKey) {
    exitErr("Please specify the path-key.");
  }

  fs.readFile(inputFile, 'utf8', function (err, data) {
    if (err) {
      process.exitCode = 1;
      return console.error(err);
    }
    const lines = data.split(/\r?\n/);

    // Determine the input type from the first non-blank line.
    const firstRecord = lines.find(function (line) {
      return line.trim() !== '';
    });
    if (firstRecord === undefined) {
      return; // nothing to split
    }
    const wrapped = WRAPPED_RECORD.test(firstRecord);

    lines.forEach(function (line, i) {
      if (line.trim() === '') {
        return;
      }
      const json = parseRecord(line, wrapped, i + 1);

      // Read the routing values before they are (optionally) removed.
      const filename = json[nameKey];
      const folder = json[pathKey];
      if (opts['omit-name']) {
        delete json[nameKey];
      }
      if (opts['omit-path']) {
        delete json[pathKey];
      }

      const outfile = getOutputPath(folder) + "/" + filename + ".json";

      if (outKey) {
        // Add the record to the array stored under out-key.
        const obj = fs.existsSync(outfile)
          ? JSON.parse(fs.readFileSync(outfile, 'utf8'))
          : {};
        if (!Array.isArray(obj[outKey])) {
          obj[outKey] = [];
        }
        obj[outKey].push(json);
        fs.writeFileSync(outfile, JSON.stringify(obj));
      }
      else {
        // Synchronous append: keeps records in input order, surfaces write
        // errors, and never holds more than one output file open at a time.
        fs.appendFileSync(outfile, JSON.stringify(json) + "\n");
      }
    });
  });
}

/**
 * Merge every line of every "*.json" file in a directory into one output
 * file, in the format chosen interactively via getFormat():
 * 1 = "[<line>]," per line, 2 = NDJSON, 3 = ESJSON (an index action line
 * before each record, with _id taken from the record's --index key).
 *
 * @param {string} mergeDir - directory containing the files to merge
 */
function mergeFiles(mergeDir) {
  const outName = opts['merge-output'];
  // Fail before prompting if there is nowhere to write the result.
  if (!outName) {
    exitErr('Please specify merge-output file.');
  }

  getFormat(function (choice) {
    const format = Number(choice);
    const index = opts.index;
    if (format === 3 && !index) {
      console.log("You forgot to declare an index (e.g.- pid) at EOL, run script again.");
      process.exit(-1);
    }

    const chunks = [];
    fs.readdirSync(mergeDir)
      .filter(function (name) { return name.endsWith(".json"); })
      .forEach(function (name) {
        const text = fs.readFileSync(path.join(mergeDir, name), "utf8");
        text.split(/\r?\n/).forEach(function (item) {
          if (item.trim() === "") {
            return;
          }
          switch (format) {
            case 1: // minified JSON
              chunks.push("[" + item + "],\n");
              break;
            case 2: // NDJSON
              chunks.push(item + "\n");
              break;
            case 3: // ESJSON
              // JSON.stringify escapes quotes/backslashes inside the id.
              chunks.push(
                JSON.stringify({ index: { _id: String(JSON.parse(item)[index]) } }) +
                "\n" + item + "\n"
              );
              break;
            default:
              break;
          }
        });
      });

    const target = path.join(getOutputPath(), outName);
    fs.writeFileSync(target, chunks.join(""));
    // getFormat() leaves stdin resumed, so exit explicitly once the file is
    // written.
    process.exit();
  });
}

if (opts.split) {
  splitFile(opts.split);
}
else if (opts.merge) {
  mergeFiles(opts.merge);
}
else {
  console.log("Please provide a correct action");
}

/**
 * Prompt on stdout for the merge output format and read the answer from
 * stdin. Re-prompts (by calling itself again) until the answer is exactly
 * 1, 2 or 3, then passes it to the callback.
 *
 * stdin is left resumed afterwards, so the caller is responsible for ending
 * the process.
 *
 * @param {function(string): void} callback - receives the validated choice as
 *   a trimmed string: "1", "2" or "3"
 */
function getFormat(callback) {
  process.stdout.write(
    "Select output format: 1:minified JSON, 2: NDJSON, 3:ESJSON: "
  );
  process.stdin.setEncoding('utf8');
  process.stdin.once('data', function (val) {
    const choice = String(val).trim();
    // A chained comparison such as `0 < n < 3` compares a boolean with 3 and
    // is always true, so match the three valid answers explicitly. This also
    // rejects an empty line, which Number() would turn into 0.
    if (/^[123]$/.test(choice)) {
      callback(choice);
    }
    else {
      // if input is invalid, ask again
      getFormat(callback);
    }
  }).resume();
}

/**
 * Create every folder along a '/'-separated path, one level at a time, and
 * return the path that was built.
 *
 * Each segment is passed through fixName() first. The path is rebuilt by
 * prefixing every segment with '/', so an input that already starts with '/'
 * yields a result starting with '//'.
 *
 * @param {string} dir - '/'-separated directory path
 * @returns {string} the rebuilt path
 */
function mkDir(dir) {
  let built = '';
  for (const segment of dir.split('/')) {
    built += '/' + fixName(segment);
    const missing = !fs.existsSync(built);
    if (missing) {
      fs.mkdirSync(built);
    }
  }
  return built;
}

/**
 * Replace every run of whitespace in a name with a single underscore.
 *
 * @param {string} name - folder or file name
 * @returns {string} the name without whitespace
 */
function fixName(name) {
  const pieces = name.split(/\s+/g);
  return pieces.join('_');
}

/**
 * Resolve a folder below the --output-dir option to an absolute path, create
 * it via mkDir(), and return the path mkDir() reports.
 *
 * @param {string} [dir=''] - folder relative to the output dir; the default
 *   addresses the output dir itself
 * @returns {string} the path returned by mkDir()
 */
function getOutputPath(dir='') {
  const joined = path.join(opts['output-dir'], dir);
  const absolute = path.resolve(joined);
  return mkDir(absolute);
}

Install & link;

npm install
npm link
cd /Users/propertydb/.npm-packages/bin
ln -s /Users/propertydb/.npm-packages/lib/node_modules/split-merge/split-merge.js  split-merge

Usage;


  node split-merge.js -s FILE [options]
  node split-merge -s FILE [options]
  node split-merge.js -m DIR [options]
  node split-merge -m DIR [options]

  The first two forms split FILE.
  The last two forms merge files in DIR.

Split options

  -s   --split file     a json-line file to split                                        
  -n   --name-key key   key for the name of file, will groups objects with the same file 
  -p   --path-key key   key for the output path                                          
  -t   --omit-name      omit the name key                                                
  -u   --omit-path      omit the path key                                                
  -j   --out-jl         output the groups as json-line instead of array (not defined in split-merge.js above; json-line output is already the default when -k is omitted)
  -k   --out-key key    output the groups as array value of this key                     

Merge options

  -m   --merge dir                dir with json files to merge 
  -o   --merge-output file        merge output file            
  -x   --index ESJSON index key   specify index key for ESJSON 

General options

  -d   --output-dir dir   root output dir, defaults to current dir 
  -h   --help             show this help   

Split example

split-merge -s output.jl -n filename -p folder -tu

Merge examples;

split-merge -m someFolder -o esjson.json -x esIndexKey
split-merge -m someFolder -o esjson.json -grf ASSESSMENT -x ASSESSMENT[0].pid

Output normal JSON and designate name value;

split-merge -s someRecords.ndjson -n filename -p folder -tu -k userAddressKeyword

If splitting or merging fails on OS X with "too many open files" (EMFILE) errors, raise the open-file limit;

sudo nano /Library/LaunchDaemons/limit.maxfiles.plist
<?xml version="1.0" encoding="UTF-8"?>  
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"  
        "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">  
  <dict>
    <key>Label</key>
    <string>limit.maxfiles</string>
    <key>ProgramArguments</key>
    <array>
      <string>launchctl</string>
      <string>limit</string>
      <string>maxfiles</string>
      <string>128000</string>
      <string>768000</string>
    </array>
    <key>RunAtLoad</key>
    <true/>
    <key>ServiceIPC</key>
    <false/>
  </dict>
</plist>
sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist
sudo launchctl load -w /Library/LaunchDaemons/limit.maxfiles.plist
launchctl limit maxfiles