steveosoule
10/30/2015 - 7:41 PM

Parse Data File and Skip Duplicates

Parse Data File and Skip Duplicates

<?php

/**
 * Copies lines from a tab-separated input file to an output file, keeping
 * only the first line seen for each ID. The ID is the first tab-separated
 * column of a line. At most $max lines are read from the input file.
 */

// Global Config
$counter = 0;      // number of lines read from the in-file so far
$max = 100000;     // maximum number of lines to read
$ids = array();    // set of IDs already written; the ID is the array key

// Report progress roughly every tenth of $max lines. Clamped to at least 1
// so that a small $max can never cause a modulo-by-zero error.
$progress_interval = max(1, (int) floor($max / 10));

$info = array();
$info['started'] = microtime(true);

/**
 * Returns the time elapsed since the script started.
 *
 * Reads the start timestamp from the global $info['started'].
 *
 * @return float Elapsed seconds (with microsecond fraction).
 */
function get_time_difference(){
	global $info;
	return microtime(true) - $info['started'];
}

// In-file
$in = array();
$in['filepath'] = 'data.txt';
$in['fh'] = fopen($in['filepath'], 'r');
if( $in['fh'] === false ){
	throw new RuntimeException('Unable to open input file: '.$in['filepath']);
}

// Out-file
$out = array();
$out['filepath'] = 'data.unique.txt';
$out['fh'] = fopen($out['filepath'], 'w');
if( $out['fh'] === false ){
	fclose($in['fh']);
	throw new RuntimeException('Unable to open output file: '.$out['filepath']);
}

// Read in-file & write to out-file.
// The counter is checked first so no line beyond $max is consumed, and the
// loop ends on the fgets() result itself rather than on feof(), which would
// allow one extra iteration with a false "line" at end of file.
while ($counter < $max && ($in['line'] = fgets($in['fh'])) !== false) {
	$counter++;
	if( $counter % $progress_interval === 0 ){
		echo $counter.' in '.get_time_difference()."\n";
	}

	$in['columns'] = explode("\t", $in['line']);
	// Strip the line terminator so a single-column line has the same ID
	// whether or not it ends with a newline.
	$in['id'] = rtrim($in['columns'][0], "\r\n");

	// Key lookup is constant-time and does not use the loose comparison of
	// in_array(), which treats numeric strings like "007" and "7" as equal.
	if( isset($ids[$in['id']]) )
	{
		continue;
	}

	$ids[$in['id']] = true;
	if( fwrite($out['fh'], $in['line']) === false ){
		fclose($in['fh']);
		fclose($out['fh']);
		throw new RuntimeException('Unable to write to output file: '.$out['filepath']);
	}
}

// Complete
fclose($in['fh']);
fclose($out['fh']);
echo "Finished";

?>