# Script that converts the KDD Cup 1999 data set into Weka (ARFF) format.
#!/usr/bin/env ruby
# Converts the KDD Cup 1999 data set into Weka's ARFF format.
#
# The converter reads two inputs: a "names" file (first line is the
# comma-separated list of class labels, each following line is
# "attribute_name: continuous." or "attribute_name: symbolic.") and a
# comma-separated data file whose last field is the class label.
#
# Typical call order on one instance: output_header, output_relation,
# output_attribute, output_data. output_data checks labels against the list
# that output_attribute loads, so output_attribute must run first.
class KddCup2arff
  # Initializes the hard-coded nominal value lists used for the ARFF header
  # and for the unknown-value check in output_data.
  def initialize
    # Class labels; filled from the first line of the names file.
    @types = []
    # Labels that occur in the data file but not in the names file.
    @only_in_data_types = %w[
      snmpgetattack named xlock xsnoop sendmail saint apache2 udpstorm xterm
      mscan processtable ps httptunnel worm mailbomb sqlattack snmpguess
    ]
    @protocol_type = %w[udp tcp icmp]
    @service = %w[
      private domain_u http smtp ftp_data ftp eco_i other auth ecr_i IRC X11
      finger time domain telnet pop_3 ldap login name ntp_u http_443 sunrpc
      printer systat tim_i netstat remote_job link urp_i sql_net bgp pop_2
      tftp_u uucp imap4 pm_dump nnsp courier daytime iso_tsap echo discard ssh
      whois mtp gopher rje ctf supdup hostnames csnet_ns uucp_path nntp
      netbios_ns netbios_dgm netbios_ssn vmnet Z39_50 exec shell efs klogin
      kshell icmp
    ]
    @flag = %w[SF RSTR S1 REJ S3 RSTO S0 S2 RSTOS0 SH OTH]
    @land = %w[0 1]
    # The empty lists below are never read by this class. The trailing
    # numbers presumably record how many distinct values were observed in the
    # data -- TODO confirm.
    @wrong_fragment = [] # 3
    @urgent = [] # 4
    @hot = [] # 18
    @num_failed_logins = [] # 5
    @logged_in = %w[0 1]
    @num_compromised = [] # 24
    @root_shell = [] # 2
    @su_attempted = [] # 3
    @num_root = [] # 21
    @num_file_creations = [] # 12
    @num_shells = [] # 4
    @num_access_files = [] # 5
    @num_outbound_cmds = [] # 1
    @is_host_login = %w[0 1]
    @is_guest_login = %w[0 1]
  end

  # Writes the ARFF comment header.
  #
  # @param relation [Object] unused; kept for interface compatibility
  # @param name_file [String] path of the names file (only echoed)
  # @param data_file [String] path of the data file (only echoed)
  # @param out [#print] destination stream
  def output_header(relation, name_file, data_file, out)
    out.print "% KDD Cup 1999: Data\n"
    out.print "% http://www.sigkdd.org/kddcup/index.php?section=1999&method=data\n"
    out.print "% #{name_file} A list of features\n"
    out.print "% #{data_file} data set\n"
  end

  # Writes the "@RELATION" line followed by a blank line.
  #
  # @param relation [#to_s] relation name
  # @param out [#print] destination stream
  def output_relation(relation, out)
    out.print "@RELATION #{relation}\n"
    out.print "\n"
  end
  # Historical (misspelled) name; kept so existing callers continue to work.
  alias_method :output_relatin, :output_relation

  # Reads the names file and writes one "@ATTRIBUTE" line per declared
  # feature, then the nominal "type" (class label) attribute and a blank line.
  #
  # Side effect: replaces @types with the labels from the first line of the
  # names file plus the labels known to occur only in the data file.
  #
  # Lines after the first that do not look like "name: type" (for example
  # blank lines) are skipped.
  #
  # @param name_file [String] path of the names file
  # @param out [#print] destination stream
  def output_attribute(name_file, out)
    nominal = nominal_values
    attributes = []
    # File.open, not Kernel#open: the path comes from the command line and
    # must never be interpreted as a "|command" pipe.
    File.open(name_file) do |file|
      file.each_line.with_index do |line, index|
        if index.zero?
          @types = line.chomp.sub(/\.$/, "").split(",")
          @types.concat(@only_in_data_types)
          next
        end

        declaration = line.match(/(\S+): *(\S+)/)
        next unless declaration

        name, declared_type = declaration.captures
        attributes << "@ATTRIBUTE #{name} #{arff_datatype(nominal, name, declared_type)}\n"
      end
    end
    attributes.each { |attribute| out.print attribute }
    out.print "@ATTRIBUTE type {#{@types.join(',')}}\n"
    out.print "\n"
  end

  # Writes the "@DATA" section: every line of the data file with its trailing
  # "." removed.
  #
  # While copying, values of the label and of the nominal columns that are not
  # in the configured lists are collected. After the data rows, one line per
  # affected column is written to +out+ as a double-quoted, comma-separated
  # list of the distinct unknown values, in first-seen order.
  # NOTE(review): those trailing lines are not valid ARFF rows; they look like
  # a diagnostic meant to be pasted back into the lists in #initialize.
  #
  # @param data_file [String] path of the comma-separated data file
  # @param out [#print] destination stream
  def output_data(data_file, out)
    out.print "@DATA\n"
    checks = checked_columns
    # One list of distinct unknown values per checked column. Deduplicating on
    # insert keeps memory bounded by the number of distinct values.
    unknown = checks.map { [] }
    File.open(data_file) do |file|
      file.each_line do |line|
        record = line.chomp.sub(/\.$/, "")
        out.print record, "\n"
        fields = record.split(",")
        checks.each_with_index do |(column, allowed), slot|
          value = fields[column]
          next if allowed.include?(value) || unknown[slot].include?(value)

          unknown[slot] << value
        end
      end
    end
    unknown.each do |values|
      next if values.empty?

      out.print values.map { |value| "\"#{value}\"" }.join(","), "\n"
    end
  end

  private

  # Maps each nominal attribute name to its list of allowed values.
  def nominal_values
    {
      "protocol_type" => @protocol_type,
      "service" => @service,
      "flag" => @flag,
      "land" => @land,
      "logged_in" => @logged_in,
      "is_host_login" => @is_host_login,
      "is_guest_login" => @is_guest_login
    }
  end

  # Returns the ARFF datatype for one attribute: a "{a,b,...}" nominal
  # specification when the attribute has a known value list, "NUMERIC" for
  # "continuous." declarations, and "string" otherwise.
  def arff_datatype(nominal, name, declared_type)
    values = nominal[name]
    return "{#{values.join(',')}}" if values

    declared_type == "continuous." ? "NUMERIC" : "string"
  end

  # Pairs of [field index, allowed values] checked by output_data, in the
  # order their diagnostics are written. Index -1 is the last field (label).
  def checked_columns
    [
      [-1, @types],
      [1, @protocol_type],
      [2, @service],
      [3, @flag],
      [6, @land],
      [11, @logged_in],
      [20, @is_host_login],
      [21, @is_guest_login]
    ]
  end
end
# Command-line entry point, run only when this file is executed directly:
#
#   <script> relation name_file data_file
#
# Writes the converted data set to "<data_file>.arff".
if $0 == __FILE__
  # All three positional arguments are required. The previous "< 2" check let
  # a two-argument call through and then failed on a nil data_file.
  if ARGV.size < 3
    print "usage: " + $0 + " relation" + " name_file" + " data_file\n"
    exit()
  end

  relation, name_file, data_file = ARGV.shift(3)

  # Block form closes the output file even if the conversion raises.
  # File.open (not Kernel#open) so the path is never treated as a pipe.
  File.open(data_file + ".arff", "w") do |out|
    convert = KddCup2arff.new
    convert.output_header(relation, name_file, data_file, out)
    convert.output_relatin(relation, out)
    convert.output_attribute(name_file, out)
    convert.output_data(data_file, out)
  end
end