seancdavis
10/12/2014 - 4:11 PM

Related Content (without metadata) in Rails using tf-idf

Related Content (without metadata) in Rails using tf-idf

# app/controllers/posts_controller.rb

# Displays, creates and edits posts. Whenever a post is saved successfully,
# its stored related-post list is recomputed, and so are the lists of the
# posts it now relates to.
class PostsController < ApplicationController

  # Shows a single post together with at most three related posts.
  def show
    @post = find_post
    @related = @post.related.first(3)
  end

  # Builds an empty post for the "new" form.
  def new
    @post = Post.new
  end

  # Persists a new post; re-renders the form when saving fails.
  def create
    @post = Post.new(post_params)
    if @post.save
      refresh_related(@post)
      redirect_to @post, :notice => "Post was created successfully."
    else
      render 'new'
    end
  end

  # Loads the post for the "edit" form.
  def edit
    @post = find_post
  end

  # Applies the submitted changes; re-renders the form when saving fails.
  def update
    @post = find_post
    if @post.update(post_params)
      refresh_related(@post)
      redirect_to @post, :notice => "Post was updated successfully."
    else
      render 'edit'
    end
  end

  private

    # Looks the post up by the :id request parameter. Post.find raises
    # ActiveRecord::RecordNotFound for an unknown id, so the action stops
    # there instead of continuing with a nil @post.
    def find_post
      Post.find(params[:id])
    end

    # Recomputes the related list of +post+ first, then the list of every
    # post it now relates to, so those lists can pick up the saved post.
    def refresh_related(post)
      post.update_related!
      post.related.each(&:update_related!)
    end

    # Whitelists the attributes a client may set on a post.
    def post_params
      params.require(:post).permit(:title, :body, :published)
    end

end
# app/models/post.rb

include ActionView::Helpers::SanitizeHelper

# A post that caches two derived values as comma-separated strings:
#
# * +words+         - the sorted, lower-cased words of +body+ (duplicates
#                     are kept, so term frequency is preserved);
# * +related_posts+ - the ids of the best matching published posts, ranked
#                     by a tf-idf style score.
class Post < ActiveRecord::Base

  # How many related post ids #update_related! stores.
  RELATED_LIMIT = 3

  # Characters that end one word and start the next. Whitespace is included
  # so that words on adjacent lines are not fused together.
  WORD_SEPARATORS = %r{[\s.,!?/]+}

  after_save :update_words!

  # Rebuilds the +words+ column from +body+.
  #
  # Markup inside <pre> and <code> is dropped, remaining tags and entities
  # are stripped, the text is split into words, and every character other
  # than a-z is removed from each word. The column is written with
  # update_columns, so no callbacks or validations run.
  def update_words!
    require 'htmlentities'
    require 'nokogiri'
    doc = Nokogiri::HTML.parse(body.to_s)
    # A single union query removes both kinds of element, whether or not
    # the body contains a <pre>.
    doc.xpath('//pre | //code').remove
    text = HTMLEntities.new.decode(sanitize(doc.text, :tags => [])).downcase
    tokens = text.split(WORD_SEPARATORS).map { |token| token.gsub(/[^a-z]/, '') }
    update_columns(:words => tokens.reject(&:blank?).sort.join(','))
  end

  # Scores every other published post against this one and stores the ids
  # of the RELATED_LIMIT best matches, best first, in +related_posts+.
  #
  # A post's score is the sum of the inverse document frequencies of the
  # words it shares with this post; a word that occurs several times in
  # both posts counts once per shared occurrence.
  def update_related!
    posts = Post.all
    idf = inverse_document_frequency(posts)
    own_words = words.to_s.split(',')
    scores = {}
    (posts.select(&:published?) - [self]).each do |post|
      shared = own_words.multiset(post.words.to_s.split(','))
      scores[post.id] = shared.inject(0) { |total, word| total + idf.fetch(word, 0) }
    end
    best_ids = scores.sort_by { |_, score| -score }.first(RELATED_LIMIT).map(&:first)
    update_columns(:related_posts => best_ids.join(','))
  end

  # Returns the published posts whose ids are stored in +related_posts+ as
  # a relation. The relation does not preserve the stored ranking order,
  # and it is empty for a post whose related list was never computed.
  def related
    Post.where(:published => true, :id => related_posts.to_s.split(','))
  end

  private

    # Builds a hash mapping each word to log(N / df), where N is the number
    # of +posts+ and df is the number of posts containing the word.
    #
    # Posts whose +words+ column is still blank are indexed on the fly so
    # they take part in the calculation.
    def inverse_document_frequency(posts)
      document_counts = Hash.new(0)
      posts.each do |post|
        post.update_words! if post.words.blank?
        post.words.to_s.split(',').uniq.each { |word| document_counts[word] += 1 }
      end
      # Float division: an integer ratio would truncate (3 / 2 == 1) and
      # give a weight of zero to words found in more than half the posts.
      total = posts.size.to_f
      document_counts.each_with_object({}) do |(word, count), idf|
        idf[word] = Math.log(total / count)
      end
    end

end
# config/initializers/array.rb

class Array

  # Multiset (bag) intersection of this array with +arr+.
  #
  # Every distinct element present in both arrays appears in the result as
  # many times as it occurs in whichever array holds fewer copies. Elements
  # are grouped by value, in the order they first appear in the receiver.
  #
  #   %w[a b a c].multiset(%w[a a b d])  # => ["a", "a", "b"]
  #
  # @param arr [Array] the array to intersect with
  # @return [Array] a new array; neither input is modified
  def multiset(arr)
    own_counts = Hash.new(0)
    each { |item| own_counts[item] += 1 }
    other_counts = Hash.new(0)
    arr.each { |item| other_counts[item] += 1 }

    own_counts.each_with_object([]) do |(item, count), shared|
      # concat appends the copies as they are, so elements that are
      # themselves arrays stay intact instead of being flattened.
      shared.concat([item] * [count, other_counts[item]].min)
    end
  end

end
$ bundle exec rails g migration add_words_to_posts words:text
$ bundle exec rake db:migrate

$ bundle install

$ bundle exec rails g migration add_related_posts_to_posts related_posts
$ bundle exec rake db:migrate
# Gemfile

gem 'htmlentities'
gem 'nokogiri'