This script processes the YAML output of Simian (http://www.harukizaemon.com/simian/) and generates useful summary information.
#!/usr/bin/env ruby
require 'yaml'
require 'pp'
# Number of entries shown in each "top" listing.
TOP_LIMIT = 10

# Prints the command-line usage help to standard output.
def print_usage
  puts "Usage: simian-sum <simian-result.yml>"
  puts
  puts "Remember to remove the first 3 lines of the yml file"
end

# Aggregates duplication totals per source file.
#
# Every block of a set contributes the set's 'lineCount' to the file named by
# the block's 'sourceFile', and increments that file's block counter by one.
#
# @param sets [Array<Hash>] duplication sets, each with 'lineCount' and 'blocks'
# @return [Hash{String => Hash}] per-file 'lineCount' / 'blockCount' totals
def file_stats(sets)
  sets.each_with_object({}) do |set, files|
    line_count = set['lineCount'].to_i
    set['blocks'].each do |block|
      stats = (files[block['sourceFile']] ||= { 'lineCount' => 0, 'blockCount' => 0 })
      stats['lineCount'] += line_count
      stats['blockCount'] += 1
    end
  end
end

# Returns the files with the most duplicated lines, largest first.
#
# @param sets [Array<Hash>] duplication sets (see #file_stats)
# @param limit [Integer] maximum number of entries returned
# @return [Array<Array(String, Hash)>] [file name, stats] pairs
def top_files(sets, limit = TOP_LIMIT)
  file_stats(sets).sort_by { |_name, stats| -stats['lineCount'] }.first(limit)
end

# Returns the duplication sets with the highest line count, largest first.
#
# 'lineCount' is coerced with to_i, matching #file_stats, so the ordering stays
# numeric even if the value was loaded as a String (unary minus on a String is
# not a numeric negation).
#
# @param sets [Array<Hash>] duplication sets
# @param limit [Integer] maximum number of entries returned
# @return [Array<Hash>] the largest sets
def largest_sets(sets, limit = TOP_LIMIT)
  sets.sort_by { |set| -set['lineCount'].to_i }.first(limit)
end

# Computes the percentage of significant lines that are duplicated.
#
# @param summary [Hash] needs 'duplicateLineCount' and 'totalSignificantLineCount'
# @return [Float] percentage; 0.0 when there are no significant lines, so the
#   report never prints "NaN%"
def duplication_percent(summary)
  total = summary['totalSignificantLineCount'].to_f
  return 0.0 if total.zero?

  summary['duplicateLineCount'].to_f / total * 100
end

# Prints the "files most duplicated" table.
def print_top_files(sets)
  puts "FILES MOST DUPLICATED"
  puts "file, numDuplications, numDuplicatedLines"
  puts top_files(sets).map { |name, stats| "#{name}, #{stats['blockCount']}, #{stats['lineCount']}" }
  puts
end

# Prints the "largest duplication blocks" table.
def print_largest_sets(sets)
  puts "LARGEST DUPLICATION BLOCKS"
  puts "lineCount, fileCount"
  puts largest_sets(sets).map { |set| "#{set['lineCount']}, #{set['blocks'].count}" }
  puts
end

# Prints the overall totals taken from the check's summary section.
def print_summary(summary)
  puts "SUMMARY"
  puts "Total number of duplicate lines: #{summary['duplicateLineCount']}"
  puts "Total number of duplicate blocks: #{summary['duplicateBlockCount']}"
  puts "Total number of files with duplicates: #{summary['duplicateFileCount']}"
  puts "Total number of files: #{summary['totalFileCount']}"
  puts "Total number of significant lines: #{summary['totalSignificantLineCount']}"
  puts "% Duplication: #{duplication_percent(summary)}%"
end

# Entry point: loads the report named by the first argument and prints the
# duplication report. Prints the usage help and returns when no argument is given.
#
# Only the first entry of simian.checks is read; its 'sets' and 'summary'
# sections drive the whole report.
#
# @param argv [Array<String>] command-line arguments
# @return [void]
def main(argv)
  if argv.empty?
    print_usage
    return
  end

  # NOTE(review): YAML.load_file may instantiate arbitrary objects on older
  # Psych versions; only feed this script reports you trust.
  simian = YAML.load_file(argv[0])
  check = simian['simian']['checks'][0]
  # NOTE(review): tolerates a missing 'sets' key by treating it as "no
  # duplications" -- confirm whether the report can actually omit it.
  sets = check['sets'] || []

  puts "Duplications found: #{sets.count}"
  puts
  print_top_files(sets)
  print_largest_sets(sets)
  print_summary(check['summary'])
end

# Run immediately, exactly as the original flat script did.
main(ARGV)