kernelp4nic
3/22/2014 - 6:15 PM

Build an inverted index for a full-text search engine with Redis.

Build an inverted index for a full-text search engine with Redis.

# Build an inverted index for a full-text search engine with Redis.
# Copyright (C) 2009 Salvatore Sanfilippo. Under the BSD License.
# USAGE:
#
# ruby invertedindex.rb add somedir/*.c
# ruby invertedindex.rb add somedir/*.txt
# ruby invertedindex.rb search your query string

require 'rubygems'
require 'redis'
require 'digest/md5'

# Shared Redis client used by every function and command in this script.
# Redis.new is called without arguments, so the client's default
# connection settings apply.
R = Redis.new

# Return the integer id associated with +token+ inside the +object+
# namespace, allocating a fresh id the first time the token is seen.
#
# Redis keys used:
#   "<object>:<md5 of token>:id"  -> id assigned to the token
#   "<object>:next.id"            -> counter incremented to allocate ids
#   "<object>:<id>:string"        -> original token text for an id
#
# object - String key prefix (this script passes "file" and "word").
# token  - String to look up or register.
#
# Returns the id as an Integer.
def get_unique_id(object,token)
    md5 = Digest::MD5.hexdigest(token)
    id = R.get("#{object}:#{md5}:id")
    return id.to_i if id

    # Unknown token: reserve a new id and store the reverse mapping before
    # publishing the id, so the string exists once the id becomes visible.
    id = R.incr("#{object}:next.id")
    R.set("#{object}:#{id}:string",token)
    return id.to_i if R.setnx("#{object}:#{md5}:id",id)

    # Someone added the new token faster than us: drop the reverse mapping
    # we just wrote and look the token up again to get the winner's id.
    R.del("#{object}:#{id}:string")
    get_unique_id(object,token)
end

# Index one file: register +filename+ as a document and add its document id
# to the "wordindex:<word id>" Redis set of every word found in the file.
#
# Each line is stripped and split on spaces, commas, parentheses,
# semicolons and periods. Adjacent separators (e.g. ", ") make split
# produce empty strings; those are skipped so that no empty "word" is
# registered or indexed.
#
# filename - path of the file to read; also used as the document's token.
#
# Raises whatever File.open raises when the file cannot be opened; in that
# case no document id is allocated, because the id is requested only after
# the file has been opened.
def invert_file(filename)
    # Block form so the file handle is closed even if indexing raises.
    File.open(filename) do |f|
        document_id = get_unique_id("file",filename)
        f.each_line do |l|
            l.strip.split(/ |,|\)|\(|\;|\./).each do |word|
                next if word.empty?
                word_id = get_unique_id("word",word)
                R.sadd("wordindex:#{word_id}",document_id)
            end
        end
    end
end

# Command-line entry point: dispatch on the first argument.
case ARGV[0]
when "add"
    # Index every file named after the "add" command.
    ARGV[1..-1].each{|filename|
        puts "Indexing #{filename}"
        invert_file filename
    }
when "search"
    terms = ARGV[1..-1]
    if terms.empty?
        # SINTER requires at least one key, so a search without any term
        # would fail inside Redis; show how to use the command instead.
        puts "Usage: invertedindex.rb search your query string"
    else
        # One "wordindex:<id>" set per query word; the documents that
        # contain all the words are the intersection of those sets.
        sets = terms.map{|w| "wordindex:#{get_unique_id("word",w)}"}
        files = R.sinter(*sets)
        files.each{|f|
            # Map each document id back to the file name stored for it.
            puts " - #{R.get("file:#{f}:string")}"
        }
    end
else
    puts "Usage: invertedindex.rb add [filename1] [filename2] ... [filenameN]"
    puts "Usage: invertedindex.rb search your query string"
end