darcy
6/8/2012 - 2:51 PM

MongoDB Trigram Search

MongoDB Trigram Search

// THIS SHOULD WORK AS COPY/PASTE into mongo console

var games = ["here is a test", "small test", "another game"];
var query =  'here is testing a test';

//////////////////////////////////////////////

// 1. util functions

var intersect = function(arr1, arr2) {
  var r = [], o = {}, l = arr2.length, i, v;
    for (i = 0; i < l; i++) {
        o[arr2[i]] = true;
    }
    l = arr1.length;
    for (i = 0; i < l; i++) {
        v = arr1[i];
        if (v in o) {
            r.push(v);
        }
    }
    return r;
};
var trigramize = function(s) {
  //normalize string
  var w = ' '+s.toLowerCase().trim();
  //trigramize it
  var tg = [];
  for (var i=0; i<Math.min(s.length,20); i++) {
    tg.push(w.substring(i,i+3));
  }
  // get the unique trigrams 
  // (http://www.shamasis.net/2009/09/fast-algorithm-to-find-unique-items-in-javascript-array/)
  var o = {}, i, l = tg.length, r = [];
  for(i=0; i<l;i+=1) o[tg[i]] = tg[i];
  for(i in o) r.push(o[i]);

  return r;
}


// 2. create games 

db.foo.drop();
games.forEach(function(g) {
  db.foo.save({name : g, trigrams : trigramize(g) });
});


// 3. search function

var search = function(s) {
  var map = function() {
    emit(this, { trigram_match : intersect(this.trigrams, trigrams).length });
  };

  var reduce=function(key, values) {
    return values;
  }

  var trigram_query = trigramize(s);
  var res_name = ObjectId().toString();
  var res = db.foo.mapReduce(map,reduce,{ 
    out: res_name, // we could make this inline so our results aren't global, however then you don't have collection functions like sort or limit, etc
    query: {trigrams : {$in :trigram_query}}, // filter out only trigram 'hits'
    scope : { 
      trigrams : trigram_query, // pass our query param to map/reduce
      intersect : intersect  // pass the intersect function helper
    }
  });
  // res.find();
  var results = db[res_name].find().sort({value: -1}).limit(20).toArray(); //map(function(t) { return t._id});
  db[res_name].drop();
  return results;
}

search(query);