Over a million developers have joined DZone.
{{announcement.body}}
{{announcement.title}}

Use MapReduce for Search Ranking in a Node.js and MongoDB Environment

DZone's Guide to

Use MapReduce for Search Ranking in a Node.js and MongoDB Environment

There's a lot of statistical analysis that goes into ranking search results accurately. This tutorial shows you how MapReduce with Node.js and MongoDB can help.

Free Resource

Learn how you can maximize big data in the cloud with Apache Hadoop. Download this eBook now. Brought to you in partnership with Hortonworks.

Use MapReduce for Tf-Idf Ranking on a Node.js and MongoDB Environment

When developing a document search application one of the challenges is to order your results according to the occurrence of the term that you search for. Tf-Idf is a numerical statistic that assists you in weighing the results of you search.

Tf stands for term frequency.

Idf stands for Inverse document frequency.

To get a grasp we will develop a sample of tf-idf in javascript, as a node module.

function TfIdf() {
}

TfIdf.prototype.weights = function(documents,term) {

    var results = []

    var idf = this.idf(documents,term)

    for(var i=0;i<documents.length;i++) {

        var tf = this.tf(documents[i],term)
        var tfidf = tf*idf
        var result = {weight:tfidf,doc:documents[i]}    

        results.push(result)
    }

    return results
}

TfIdf.prototype.tf = function(words,term) {

    var result = 0

    for(var i=0;i<words.length;i++) {

        var word = words[i]

        if(word.indexOf(term)!=-1) {
            result = result+1
        }    
    }

    return result/words.length
}

TfIdf.prototype.idf = function(documents,term) {

    var occurence = 0

    for(var j=0;j<documents.length;j++) {

        var doc = documents[j]

        if(this.__wordInsideDoc(doc,term)){
            occurence = occurence+1
        }                  
    }

    if(occurence==0) {
        return undefined    
    }

    return Math.log(documents.length/occurence)
}

TfIdf.prototype.__wordInsideDoc = function(doc,term) {

    for(var i=0;i<doc.length;i++) {

        var word = doc[i]

        if(word.indexOf(term)!=-1) {
            return true
        }
    }    

    return false
}

module.exports = TfIdf

The function weights will accept the documents and term to search.

An example follows

var TfIdf = require('./TfIdf')

var tfIdf = new TfIdf()

var docs = [["latest","sprint"],["lair","laugh","fault"],["lemma","on"]]

console.log(tfIdf.weights(docs,"la"))

The result is:


[ { weight: 0.2027325540540822, doc: [ 'latest', 'sprint' ] },
  { weight: 0.27031007207210955,
    doc: [ 'lair', 'laugh', 'fault' ] },
  { weight: 0, doc: [ 'lemma', 'on' ] } ]

Now we shall proceed with the MapReduce approach.

I will use node.js.

First we will install the MongoDB driver:

npm install mongodb

Then we will setup our Mongo database connection. Once initialized, in case there are no records, we will populate the database for testing purposes.

This is gonna be a two phase process.

On the first phase we have to calculate the idf.

To do so we will issue a MapReduce.

The term variable has to be passed in order to be used by the MapReduce process.

In order to use a dynamic variable on MapReduce we will employee the scope parameter.

TfIdfMongo.prototype.__idf = function(connection,term,callback) {

    var tfIdfMongo = this

    var documents = connection.collection('documents');

    documents.mapReduce(
        tfIdfMongo.__mapIdf,
        tfIdfMongo.__reduceIdf,
        {
            scope: {permterm:term},
            out: "tfidf_results"
        },
        function(err,results) {

            if(err) {
                callback(err)
            }

            results.findOne({},function(err,result) {

                if(err) {
                    callback(err)
                }

                if(result.value.occurrence==0) {
                    return;
                }

                var idf = Math.log(result.value.count/result.value.occurrence)

                callback(undefined,idf)
            })
        }
    )
}

TfIdfMongo.prototype.__mapIdf = function() {

    var term = permterm

    var occurrence = 0

    for (var i = 0; i < this.words.length; i++) {

        var word = this.words[i]

        if (word.indexOf(term) != -1) {

            if (occurrence <=0 ) {

                occurrence = 1
            }
        }
    }

     emit("idf", occurrence)
}

TfIdfMongo.prototype.__reduceIdf = function(key,values) {

    var result = {count:values.length,occurrence:0}

    for(var i=0;i<values.length;i++) {

        if(values[i]==1) {
            result.occurrence += 1
        }
    }

    return result
}

The result is one number.

On the second phase we have to calculate the tf for each document and multiply the result with the idf value calculated prior to this.

MapReduce will be used for this case too.

This time through the scope parameter, we are going to pass the term that we search for but also the idf variable.

TfIdfMongo.prototype.__tf = function(connection,term,idf,callback) {

    var tfIdfMongo = this

    var documents = connection.collection('documents');

    documents.mapReduce(
        tfIdfMongo.__mapTf,
        function(key,values) {

            return values
        },
        {
            scope: {permTerm:term,permIdf:idf},
            out: "tf_results"
        },
        function(err,results) {

            if(err) {
                callback(err)
            }

            results.find({},function(err,docs) {

                if(err) {
                    callback(err)
                }

                docs.toArray(function (err,documents) {
                    callback(err,documents)
                })
            })
        }
    )
}

TfIdfMongo.prototype.__mapTf = function() {

    var term = permTerm
    var idf = permIdf

    var occurrence = 0

    for(var i=0;i<this.words.length;i++) {

        var word = this.words[i]
        if (word.indexOf(term) != -1) {

            occurrence += 1
        }
    }

    var weight = idf*(occurrence/this.words.length)

    emit(this, weight)
}

We will implement the tfIdf function which combines the two previous steps.

The function takes the term that we need to search for as an argument.

var MongoClient = require('mongodb').MongoClient
Server = require('mongodb').Server

var url = 'mongodb://localhost:27017/mapreduceexample'

function TfIdfMongo() {
}

TfIdfMongo.prototype.tfIdf = function(term,callback) {

    var tfIdfMongo = this

    tfIdfMongo.__getConnection(function(err,connection) {

        if(err) {
            callback(err)
        }

        tfIdfMongo.__idf(connection,term,function(err,idf) {

            if(err) {
                callback(err)
            }

            tfIdfMongo.__tf(connection,term,idf,function(err,documents) {

                if(err) {
                    callback(err)
                }

                connection.close()

                callback(undefined,documents)

            })

        })
    })
}

TfIdfMongo.prototype.__getConnection = function(callback) {

    var tfIdfMongo = this

    MongoClient.connect(url,function (err, connection) {
        if (err) {
            callback(err)
        } else {

            var documents = connection.collection('documents');

            documents.count({}, function (error, numOfDocs) {
                if (numOfDocs == 0) {
                    tfIdfMongo.__insertTestRecords(connection,function(err) {
                        callback(err,connection)
                    })
                } else {
                    callback(undefined,connection)
                }
            })
        }
    })
}

TfIdfMongo.prototype.__insertTestRecords = function(connection,callback) {

    var documents = connection.collection('documents');

    var latestDocuments = [
        {words:["latest","sprint"]},
        {words:["lair","laugh","fault"]},
        {words:["lemma","on"]}
    ]

    documents.insert(latestDocuments,
        function(err,result) {
            callback(err)
        })

}

TfIdfMongo.prototype.__tf = function(connection,term,idf,callback) {

    var tfIdfMongo = this

    var documents = connection.collection('documents');

    documents.mapReduce(
        tfIdfMongo.__mapTf,
        function(key,values) {

            return values
        },
        {
            scope: {permTerm:term,permIdf:idf},
            out: "tf_results"
        },
        function(err,results) {

            if(err) {
                callback(err)
            }

            results.find({},function(err,docs) {

                if(err) {
                    callback(err)
                }

                docs.toArray(function (err,documents) {
                    callback(err,documents)
                })
            })
        }
    )
}

TfIdfMongo.prototype.__mapTf = function() {

    var term = permTerm
    var idf = permIdf

    var occurrence = 0

    for(var i=0;i<this.words.length;i++) {

        var word = this.words[i]
        if (word.indexOf(term) != -1) {

            occurrence += 1
        }
    }

    var weight = idf*(occurrence/this.words.length)

    emit(this, weight)
}


TfIdfMongo.prototype.__idf = function(connection,term,callback) {

    var tfIdfMongo = this

    var documents = connection.collection('documents');

    documents.mapReduce(
        tfIdfMongo.__mapIdf,
        tfIdfMongo.__reduceIdf,
        {
            scope: {permterm:term},
            out: "tfidf_results"
        },
        function(err,results) {

            if(err) {
                callback(err)
            }

            results.findOne({},function(err,result) {

                if(err) {
                    callback(err)
                }

                if(result.value.occurrence==0) {
                    return;
                }

                var idf = Math.log(result.value.count/result.value.occurrence)

                callback(undefined,idf)
            })
        }
    )
}

TfIdfMongo.prototype.__mapIdf = function() {

    var term = permterm

    var occurrence = 0

    for (var i = 0; i < this.words.length; i++) {

        var word = this.words[i]

        if (word.indexOf(term) != -1) {

            if (occurrence <=0 ) {

                occurrence = 1
            }
        }
    }

     emit(this.__id, occurrence)
}

TfIdfMongo.prototype.__reduceIdf = function(key,values) {

    var result = {count:values.length,occurrence:0}

    for(var i=0;i<values.length;i++) {

        if(values[i]==1) {
            result.occurrence += 1
        }
    }

    return result
}



module.exports = TfIdfMongo

Our test show case:

var TfIdf = require('./TfIdf')
var TfIdfMongo = require('./TfIdfMongo')

var tfIdf = new TfIdf()

var docs = [["latest","sprint"],["lair","laugh","fault"],["lemma","on"]]


console.log("The results are "+JSON.stringify(tfIdf.tfIdf(docs,"la")))

var tfIdfMongo = new TfIdfMongo()

tfIdfMongo.tfIdf("la",function(err,results) {


    console.log("The results are "+JSON.stringify(results))

})

And we get the same results for both cases.

The results are [{"weight":0.2027325540540822,"doc":["latest","sprint"]},{"weight":0.27031007207210955,"doc":["lair","laugh","fault"]},{"weight":0,"doc":["lemma","on"]}]
The results are [{"_id":{"_id":"55f46602947446bb1a7f7933","words":["latest","sprint"]},"value":0.2027325540540822},{"_id":{"_id":"55f46602947446bb1a7f7934","words":["lair","laugh","fault"]},"value":0.27031007207210955},{"_id":{"_id":"55f46602947446bb1a7f7935","words":["lemma","on"]},"value":0}]

Why Should I Use MapReduce for This problem?

The tf-idf ranking problem, is a problem which includes computations, that can be parallelised.
The sequential approach could be an option for other environments but for Node.js there are many drawbacks.

Node.js is a single threaded environment, it was not designed for heavy computational tasks.

Its magic has to do with how good it executes I/O operations.

Consider the scenario of a large dataset problem.

While the Node.js process would be executing the time consuming computations, the requests issued won’t be able to be executed appropriately.

However there are some workarounds for solutions based on Node.js, such as spawning extra nodes and implement a way of communication between them.

To Sum Up

MapReduce fits well to the ranking problem. Not only it takes away much of the computational overhead but also from the implementation overhead.

Hortonworks DataFlow is an integrated platform that makes data ingestion fast, easy, and secure. Download the white paper now.  Brought to you in partnership with Hortonworks

Topics:
node.js ,mongodb ,javascript

Published at DZone with permission of Emmanouil Gkatziouras, DZone MVB. See the original article here.

Opinions expressed by DZone contributors are their own.

The best of DZone straight to your inbox.

SEE AN EXAMPLE
Please provide a valid email address.

Thanks for subscribing!

Awesome! Check your inbox to verify your email so you can start receiving the latest in tech news and resources.
Subscribe

{{ parent.title || parent.header.title}}

{{ parent.tldr }}

{{ parent.urlSource.name }}