ayaniimi213
3/23/2013 - 6:41 AM

KDD Cup 1999のデータセットをWeka(ARFF)形式に変換するスクリプト

KDD Cup 1999のデータセットをWeka(ARFF)形式に変換するスクリプト

#!/usr/bin/env ruby

# Converts the KDD Cup 1999 data set (a feature "names" file plus a
# comma-separated data file) into Weka's ARFF format.
#
# The methods are meant to be called in this order on the same output
# stream: #output_header, #output_relation, #output_attribute, #output_data.
# #output_attribute must run before #output_data because it loads the list
# of class labels that #output_data validates against.
class KddCup2arff

  def initialize
    # Class labels; replaced by #output_attribute with the first line of the
    # names file plus @only_in_data_types.
    @types = []
    # Labels appended to the ones read from the names file
    # (original author's note: "from data file").
    @only_in_data_types = ["snmpgetattack","named","xlock","xsnoop","sendmail","saint","apache2","udpstorm","xterm","mscan","processtable","ps","httptunnel","worm","mailbomb","sqlattack","snmpguess"]

    # Allowed values of the features that are emitted as ARFF nominal types.
    @protocol_type = ["udp","tcp","icmp"]
    @service = ["private","domain_u","http","smtp","ftp_data","ftp","eco_i","other","auth","ecr_i","IRC","X11","finger","time","domain","telnet","pop_3","ldap","login","name","ntp_u","http_443","sunrpc","printer","systat","tim_i","netstat","remote_job","link","urp_i","sql_net","bgp","pop_2","tftp_u","uucp","imap4","pm_dump","nnsp","courier","daytime","iso_tsap","echo","discard","ssh","whois","mtp","gopher","rje","ctf","supdup","hostnames","csnet_ns","uucp_path","nntp","netbios_ns","netbios_dgm","netbios_ssn","vmnet","Z39_50","exec","shell","efs","klogin","kshell","icmp"]
    @flag = ["SF","RSTR","S1","REJ","S3","RSTO","S0","S2","RSTOS0","SH","OTH"]
    @land = ["0","1"]
    @logged_in = ["0","1"]
    @is_host_login = ["0","1"]
    @is_guest_login = ["0","1"]

    # Unused placeholders kept from the original. The trailing numbers are the
    # original author's annotations (presumably distinct-value counts seen in
    # the data -- unverified). These features get no nominal override.
    @wrong_fragment = [] # 3
    @urgent = [] # 4
    @hot = [] # 18
    @num_failed_logins = [] # 5
    @num_compromised = [] # 24
    @root_shell = [] # 2
    @su_attempted = [] # 3
    @num_root = [] # 21
    @num_file_creations = [] # 12
    @num_shells = [] # 4
    @num_access_files = [] # 5
    @num_outbound_cmds = [] # 1

    # [feature name, zero-based column in a data row, allowed values].
    # Drives both the nominal @ATTRIBUTE declarations and the data validation.
    @nominal = [
      ["protocol_type", 1, @protocol_type],
      ["service", 2, @service],
      ["flag", 3, @flag],
      ["land", 6, @land],
      ["logged_in", 11, @logged_in],
      ["is_host_login", 20, @is_host_login],
      ["is_guest_login", 21, @is_guest_login]
    ]
  end

  # Writes the ARFF comment header naming the source files.
  #
  # @param relation  unused; kept so existing callers keep working
  # @param name_file [String] path of the names file (only printed)
  # @param data_file [String] path of the data file (only printed)
  # @param out       object responding to #print
  def output_header(relation, name_file, data_file, out)
    out.print "% KDD Cup 1999: Data\n"
    out.print "% http://www.sigkdd.org/kddcup/index.php?section=1999&method=data\n"
    out.print "% " + name_file + " A list of features\n"
    out.print "% " + data_file + " data set\n"
  end

  # Writes the @RELATION line followed by a blank line.
  #
  # @param relation relation name (converted with #to_s)
  # @param out      object responding to #print
  def output_relation(relation, out)
    out.print "@RELATION #{relation}\n"
    out.print "\n"
  end

  # Original (misspelled) name, kept so existing callers keep working.
  alias output_relatin output_relation

  # Reads the names file and writes one @ATTRIBUTE line per feature, then the
  # class attribute "type" and a blank line.
  #
  # The first line of the names file is taken as the comma-separated list of
  # class labels (a trailing "." is dropped); every later line of the form
  # "name: kind" declares a feature. Lines that do not have that form (for
  # example blank lines) are skipped.
  #
  # @param name_file [String] path of the names file
  # @param out       object responding to #print
  # @raise [ArgumentError] if a feature without a nominal override has a kind
  #   other than "continuous" or "symbolic"; nothing is written in that case
  def output_attribute(name_file, out)
    attributes = []
    # File.open rather than Kernel#open: the path comes from the command line
    # and Kernel#open would treat a leading "|" as a command to run.
    File.open(name_file) do |file|
      header = file.gets
      if header
        @types = header.chomp.sub(/\.\z/, "").split(",")
        @types.concat(@only_in_data_types)
      end

      file.each_line do |line|
        declaration = /(\S+): *(\S+)/.match(line.chomp)
        next unless declaration
        name = declaration[1]
        kind = declaration[2].sub(/\.\z/, "")
        attributes << "@ATTRIBUTE #{name} #{arff_type(name, kind)}\n"
      end
    end

    attributes.each { |attribute| out.print attribute }
    out.print "@ATTRIBUTE type {" + @types.join(",") + "}\n"
    out.print "\n"
  end

  # Writes the @DATA section: every line of the data file with its trailing
  # "." removed. While copying, the class label (last field) and the nominal
  # features are checked against the declared values; each undeclared value is
  # reported once on $stderr (as a quoted, comma-separated list) instead of
  # being written into the ARFF output.
  #
  # @param data_file [String] path of the comma-separated data file
  # @param out       object responding to #print
  def output_data(data_file, out)
    out.print "@DATA\n"
    unknown = {}

    File.open(data_file) do |file|
      file.each_line do |line|
        row = line.chomp.sub(/\.\z/, "")
        out.print row, "\n"
        next if row.empty? # nothing to validate on a blank line

        fields = row.split(",")
        record_unknown(unknown, "type", fields.last, @types)
        @nominal.each do |name, column, allowed|
          record_unknown(unknown, name, fields[column], allowed)
        end
      end
    end

    (["type"] + @nominal.map(&:first)).each do |name|
      values = unknown[name]
      next unless values
      quoted = values.map { |value| "\"#{value}\"" }.join(",")
      $stderr.puts "unknown #{name} values in #{data_file}: #{quoted}"
    end
  end

  private

  # Returns the ARFF datatype for a feature: the nominal value set when the
  # feature has an override in @nominal, otherwise NUMERIC / string according
  # to the kind declared in the names file.
  def arff_type(name, kind)
    override = @nominal.assoc(name)
    return "{" + override[2].join(",") + "}" if override

    case kind
    when "continuous" then "NUMERIC"
    when "symbolic" then "string"
    else
      raise ArgumentError, "unknown attribute type #{kind.inspect} for #{name}"
    end
  end

  # Remembers +value+ under +name+ (once) when it is not among +allowed+.
  def record_unknown(unknown, name, value, allowed)
    return if allowed.include?(value)
    seen = (unknown[name] ||= [])
    seen << value unless seen.include?(value)
  end

end

# Command-line entry point:
#   <script> relation name_file data_file
# Writes the converted data set to "<data_file>.arff".
if $0 == __FILE__
  # All three arguments are required; with fewer, data_file would be nil.
  if ARGV.size < 3
    print "usage: " + $0 + " relation" + " name_file" + " data_file\n"
    exit()
  end

  relation, name_file, data_file = ARGV.shift(3)

  convert = KddCup2arff.new

  # Block form closes the output file even if the conversion raises.
  File.open(data_file + ".arff", "w") do |out|
    convert.output_header(relation, name_file, data_file, out)
    convert.output_relatin(relation, out)
    convert.output_attribute(name_file, out)
    convert.output_data(data_file, out)
  end
end