Split & Merge tool for JSON, NDJSON, and ESJSON
Create Folder;
mkdir /Users/propertydb/.npm-packages/lib/node_modules/split-merge
cd ~/.npm-packages/lib/node_modules/split-merge
Create package.json;
sudo nano package.json
{
"name": "split-merge",
"version": "1.0.0",
"description": "Split & Merge JSON, NDJSON & ESJSON",
"main": "split-merge.js",
"dependencies": {
"command-line-args": "^4.0.1",
"command-line-usage": "^4.0.0"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "collective",
"license": "ISC",
"preferGlobal": true,
"bin": {
"split-merge": "split-merge.js"
}
}
Create split-merge.js;
#! /usr/bin/env node
var fs = require("fs");
var path = require("path");
const clu = require('command-line-usage');
const cla = require('command-line-args');
// ---------------------------------------------------------------------------
// Command-line option tables.
// Every entry is handed to command-line-args (through `optionDefs`) and is
// also rendered into the help text by getSectionOption(), which reads the
// `alias`, `name`, `arg` and `desc` fields.
// ---------------------------------------------------------------------------

// Options that belong to the split action.
const splitOptions = [
  {
    name: 'split',
    alias: 's',
    type: String,
    arg: 'file',
    desc: 'a json-line file to split',
  },
  {
    name: 'name-key',
    alias: 'n',
    type: String,
    arg: 'key',
    desc: 'key for the name of file, will groups objects with the same file',
  },
  {
    name: 'path-key',
    alias: 'p',
    type: String,
    arg: 'key',
    desc: 'key for the output path',
  },
  {
    name: 'omit-name',
    alias: 't',
    type: Boolean,
    desc: 'omit the name key',
  },
  {
    name: 'omit-path',
    alias: 'u',
    type: Boolean,
    desc: 'omit the path key',
  },
  {
    name: 'out-key',
    alias: 'k',
    type: String,
    arg: 'key',
    desc: 'output the groups as array value of this key',
  },
];

// Options that belong to the merge action.
const mergeOptions = [
  {
    name: 'merge',
    alias: 'm',
    type: String,
    arg: 'dir',
    desc: 'dir with json files to merge',
  },
  {
    name: 'merge-output',
    alias: 'o',
    type: String,
    arg: 'file',
    desc: 'merge output file',
  },
  {
    name: 'index',
    alias: 'x',
    type: String,
    arg: 'ESJSON index key',
    desc: 'specify index key for ESJSON',
  },
];

// Options shared by both actions.
const generalOptions = [
  {
    name: 'output-dir',
    alias: 'd',
    type: String,
    defaultValue: '.',
    arg: 'dir',
    desc: 'root output dir, defaults to current dir',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    desc: 'show this help',
  },
];

// Flat list of every option, in split / merge / general order.
const optionDefs = [...splitOptions, ...mergeOptions, ...generalOptions];

// Sections of the usage guide, in display order: intro, usage, then one
// generated section per option table.
const help = [
  {
    header: 'split-merge',
    content: [
      'Split a json-line file or merge json files.',
      '',
      'A json-line file is a file containing valid json on each line.',
    ],
  },
  {
    header: 'Usage',
    content: [
      'node split-merge.js -s FILE [options]',
      'node split-merge.js -m DIR [options]',
      '',
      'The first form to split FILE.',
      'The second form to merge files in DIR.',
    ],
  },
  getSectionOption('Split options', splitOptions),
  getSectionOption('Merge options', mergeOptions),
  getSectionOption('General options', generalOptions),
];
/**
 * Build one section of the usage guide from a table of option definitions.
 *
 * Each option becomes a three-column row: `a` is the short flag, `b` is the
 * long flag followed by a space and the argument placeholder (empty when the
 * option has no `arg`), and `c` is the description.
 *
 * @param {string} title - header of the section.
 * @param {Array<{name: string, alias: string, arg?: string, desc: string}>} optionDef
 *   option definitions to list.
 * @returns {{header: string, content: Array<{a: string, b: string, c: string}>}}
 */
function getSectionOption(title, optionDef) {
  const rows = [];
  for (const option of optionDef) {
    const placeholder = option.arg || '';
    rows.push({
      a: `-${option.alias}`,
      b: `--${option.name} ${placeholder}`,
      c: option.desc,
    });
  }
  return { header: title, content: rows };
}
// Parse the command line into an object keyed by option name.
const opts = cla(optionDefs);

// An action is mandatory, except when the user only asked for help. Without
// the `!opts.help` guard, `split-merge --help` on its own would be reported
// as an error and exit with a failure status before the help branch further
// down is ever reached.
if (!opts.help && !opts.split && !opts.merge) {
  exitErr('Please specify an action: -s (split) or -m (merge).');
}
/**
 * Print the usage guide with an extra "Error" section, then terminate.
 *
 * The error section is appended to the shared `help` array before it is
 * rendered, so the message appears after the regular sections. The guide is
 * written with console.log and the process exits with code -1.
 *
 * @param {string} str - message to show under the "Error" header.
 */
function exitErr(str) {
  help.push({ header: 'Error', content: str });
  const usageText = clu(help);
  console.log(usageText);
  process.exit(-1);
}
// --help: render the usage guide, print it, and stop with a success status.
if (opts.help) {
  const usageGuide = clu(help);
  console.log(usageGuide);
  process.exit(0);
}
// Matches one line of the bracketed "JSON-style" input: the whole line is
// `[ ... ]`, optionally followed by a comma. Anchored on both ends so that an
// NDJSON object which merely contains an array is not mistaken for it.
const BRACKETED_LINE = /^\s*\[.*\]\s*,?\s*$/;

/**
 * Decide which flavour of input a file uses, based on its first non-blank line.
 *
 * @param {string[]} lines - the file split into lines.
 * @returns {string} "json" for `[<json>],` lines, otherwise "ndjson".
 */
function detectInputType(lines) {
  const first = lines.find((line) => line.trim() !== '');
  return first !== undefined && BRACKETED_LINE.test(first) ? 'json' : 'ndjson';
}

/**
 * Parse one non-blank input line into the records it contains.
 *
 * @param {string} line - raw line text.
 * @param {string} type - "json" or "ndjson", as returned by detectInputType().
 * @returns {Array} the parsed records (one for ndjson; every array element for json).
 * @throws {SyntaxError} when the line is not valid JSON.
 */
function parseRecords(line, type) {
  if (type === 'json') {
    // Dropping the optional trailing comma leaves a plain JSON array, which
    // JSON.parse handles correctly even when records contain nested arrays.
    const parsed = JSON.parse(line.trim().replace(/,$/, ''));
    return Array.isArray(parsed) ? parsed : [parsed];
  }
  return [JSON.parse(line)];
}

/**
 * Render one input line in the requested merge output format.
 *
 * @param {string} item - one JSON document, as text.
 * @param {number} format - 1: `[item],` lines, 2: NDJSON, 3: ESJSON (action line + item).
 * @param {string} [index] - key whose value becomes the ESJSON `_id` (format 3 only).
 * @returns {string} text to append to the merged output ('' for an unknown format).
 */
function formatMergedRecord(item, format, index) {
  switch (format) {
    case 1: // minified JSON
      return '[' + item + '],\n';
    case 2: // NDJSON
      return item + '\n';
    case 3: { // ESJSON
      // JSON.stringify escapes quotes/backslashes in the id, which manual
      // string concatenation would not.
      const action = { index: { _id: String(JSON.parse(item)[index]) } };
      return JSON.stringify(action) + '\n' + item + '\n';
    }
    default:
      return '';
  }
}

/**
 * Split `inputFile` into one output file per (path-key, name-key) pair.
 *
 * Records are grouped in memory first and each output file is then written
 * once, synchronously. This keeps the record order of the input and avoids
 * queueing one asynchronous append per record.
 *
 * @param {string} inputFile - path of the json-line / bracketed-json file.
 */
function runSplit(inputFile) {
  const nameKey = opts['name-key'];
  const pathKey = opts['path-key'];
  const outKey = opts['out-key'];
  if (!nameKey) {
    exitErr("Please specify the name-key.");
  }
  if (!pathKey) {
    exitErr("Please specify the path-key.");
  }

  let text;
  try {
    text = fs.readFileSync(inputFile, 'utf8');
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
    return;
  }

  const lines = text.split(/\r?\n/);
  const type = detectInputType(lines);
  const dirCache = new Map(); // record path value -> created output directory
  const groups = new Map(); // output file -> records, in input order

  for (let i = 0; i < lines.length; i++) {
    if (lines[i].trim() === '') {
      continue;
    }
    let records;
    try {
      records = parseRecords(lines[i], type);
    } catch (err) {
      console.error(inputFile + ':' + (i + 1) + ': cannot parse ' + type + ' line: ' + err.message);
      process.exit(1);
    }
    for (const record of records) {
      if (record === null || typeof record !== 'object') {
        console.error(inputFile + ':' + (i + 1) + ': expected a JSON object');
        process.exit(1);
      }
      // Read both keys before deleting either, in case they are the same key.
      const filename = record[nameKey];
      const dirValue = record[pathKey];
      if (opts['omit-name']) {
        delete record[nameKey];
      }
      if (opts['omit-path']) {
        delete record[pathKey];
      }
      const dirKey = dirValue == null ? '' : String(dirValue);
      let outDir = dirCache.get(dirKey);
      if (outDir === undefined) {
        outDir = getOutputPath(dirKey);
        dirCache.set(dirKey, outDir);
      }
      const outfile = outDir + "/" + filename + ".json";
      if (!groups.has(outfile)) {
        groups.set(outfile, []);
      }
      groups.get(outfile).push(record);
    }
  }

  groups.forEach((records, outfile) => {
    if (outKey) {
      // Extend the array stored under out-key, keeping whatever an earlier
      // run already wrote to this file.
      const doc = fs.existsSync(outfile)
        ? JSON.parse(fs.readFileSync(outfile, 'utf8'))
        : { [outKey]: [] };
      if (doc[outKey] === undefined) {
        doc[outKey] = [];
      }
      for (const record of records) {
        doc[outKey].push(record);
      }
      fs.writeFileSync(outfile, JSON.stringify(doc));
    } else {
      // json-line output: one record per line, appended to the file.
      fs.appendFileSync(outfile, records.map((r) => JSON.stringify(r) + "\n").join(''));
    }
  });
}

/**
 * Merge every `*.json` file of `mergeDir` into the --merge-output file, in a
 * format chosen interactively through getFormat().
 *
 * @param {string} mergeDir - directory holding the files to merge.
 */
function runMerge(mergeDir) {
  const outputName = opts['merge-output'];
  // Checked before prompting so a missing option fails immediately.
  if (!outputName) {
    exitErr('Please specify merge-output file.');
  }

  // get the desired output format from the user
  getFormat(function (answer) {
    const format = Number(answer);
    const index = opts.index;
    if (format === 3 && !index) {
      console.log("You forgot to declare an index (e.g.- pid) at EOL, run script again.");
      process.exit(1);
    }

    const target = path.join(getOutputPath(), outputName);
    const chunks = [];
    for (const entry of fs.readdirSync(mergeDir)) {
      if (!entry.endsWith(".json")) {
        continue;
      }
      const source = path.join(mergeDir, entry);
      // Never feed the output of a previous run back into the merge.
      if (path.resolve(source) === path.resolve(target)) {
        continue;
      }
      const items = fs.readFileSync(source, "utf8").split(/\r?\n/);
      for (let n = 0; n < items.length; n++) {
        if (items[n].trim() === '') {
          continue;
        }
        try {
          chunks.push(formatMergedRecord(items[n], format, index));
        } catch (err) {
          console.error(source + ':' + (n + 1) + ': cannot parse line: ' + err.message);
          process.exit(1);
        }
      }
    }

    fs.writeFileSync(target, chunks.join(''));
    // stdin was resumed by the prompt, so exit explicitly.
    process.exit(0);
  });
}

if (opts.split) {
  runSplit(opts.split);
}
else if (opts.merge) {
  runMerge(opts.merge);
}
else {
  console.log("Please provide a correct action");
}
/**
 * Ask on stdout for the merge output format and read the answer from stdin.
 *
 * The prompt is repeated (the function re-registers itself from the stdin
 * handler) until a line containing exactly 1, 2 or 3 arrives; this simulates
 * synchronous access to stdin/stdout without blocking.
 *
 * @param {function(string): void} callback - receives the accepted choice as
 *   the trimmed string "1", "2" or "3".
 */
function getFormat(callback) {
  process.stdout.write(
    "Select output format: 1:minified JSON, 2: NDJSON, 3:ESJSON: "
  );
  process.stdin.setEncoding('utf8');
  process.stdin.once('data', function (val) {
    // A chunk may carry several lines (piped input); take the first valid one.
    // Matching the text explicitly avoids a chained comparison such as
    // `0 < n < 3`, which JavaScript evaluates as `(0 < n) < 3` and is always true.
    const choice = String(val)
      .split(/\r?\n/)
      .map((line) => line.trim())
      .find((line) => /^[123]$/.test(line));
    if (choice !== undefined) {
      callback(choice);
    }
    else {
      // if input is invalid, ask again
      getFormat(callback);
    }
  }).resume();
}
/**
 * Create `dir` one '/'-separated segment at a time and return the path built.
 *
 * Every segment is passed through fixName() and appended after a '/', starting
 * from an empty string, so the returned path always begins with '/' (an
 * absolute input such as '/a/b' comes back as '//a/b'). Segments that already
 * exist are left untouched.
 *
 * @param {string} dir - directory path using '/' as separator.
 * @returns {string} the path that was checked/created, with fixed segment names.
 */
function mkDir(dir) {
  let built = '';
  for (const segment of dir.split('/')) {
    built += '/' + fixName(segment);
    const alreadyThere = fs.existsSync(built);
    if (!alreadyThere) {
      fs.mkdirSync(built);
    }
  }
  return built;
}
/**
 * Make a path segment whitespace-free: every run of whitespace characters
 * becomes a single underscore.
 *
 * @param {string} name - segment to clean.
 * @returns {string} the segment with whitespace runs replaced by '_'.
 */
function fixName(name) {
  const pieces = name.split(/\s+/);
  return pieces.join('_');
}
/**
 * Resolve `dir` below the configured --output-dir, make sure it exists, and
 * return the path reported by mkDir().
 *
 * @param {string} [dir=''] - sub-directory relative to the output root.
 * @returns {string} whatever mkDir() returns for the resolved absolute path.
 */
function getOutputPath(dir='') {
  const outputRoot = opts['output-dir'];
  const combined = path.join(outputRoot, dir);
  const absolute = path.resolve(combined);
  return mkDir(absolute);
}
Install & link;
npm install
npm link
cd /Users/propertydb/.npm-packages/bin
ln -s /Users/propertydb/.npm-packages/lib/node_modules/split-merge/split-merge.js split-merge
Usage;
node split-merge.js -s FILE [options]
node split-merge -s FILE [options]
node split-merge.js -m DIR [options]
node split-merge -m DIR [options]
The first two forms split FILE.
The last two forms merge the files in DIR.
Split options
-s --split file a json-line file to split
-n --name-key key key holding the output file name; objects with the same name are grouped into the same file
-p --path-key key key for the output path
-t --omit-name omit the name key
-u --omit-path omit the path key
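-j --out-jl output the groups as json-line instead of array (note: this option is not defined in split-merge.js above; json-line output is already what you get when -k is omitted)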
-k --out-key key output the groups as array value of this key
Merge options
-m --merge dir dir with json files to merge
-o --merge-output file merge output file
-x --index ESJSON index key specify index key for ESJSON
General options
-d --output-dir dir root output dir, defaults to current dir
-h --help show this help
Split example
split-merge -s output.jl -n filename -p folder -tu
Merge examples;
split-merge -m someFolder -o esjson.json -x esIndexKey
split-merge -m someFolder -o esjson.json -grf ASSESSMENT -x ASSESSMENT[0].pid
Output regular JSON, collecting each group's records in an array under a key of your choice (-k);
split-merge -s someRecords.ndjson -n filename -p folder -tu -k userAddressKeyword
If splitting or merging fails on OS X (for example with "too many open files" errors), raise the open-file limit;
sudo nano /Library/LaunchDaemons/limit.maxfiles.plist
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"
"http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>Label</key>
<string>limit.maxfiles</string>
<key>ProgramArguments</key>
<array>
<string>launchctl</string>
<string>limit</string>
<string>maxfiles</string>
<string>128000</string>
<string>768000</string>
</array>
<key>RunAtLoad</key>
<true/>
<key>ServiceIPC</key>
<false/>
</dict>
</plist>
sudo chown root:wheel /Library/LaunchDaemons/limit.maxfiles.plist
sudo launchctl load -w /Library/LaunchDaemons/limit.maxfiles.plist
launchctl limit maxfiles