Do MapReduce in Python with mrjob

6/16/2016 - 9:14 AM

Do MapReduce in Python with mrjob

from mrjob.step import MRStep
from mrjob.job import MRJob

class MostPopularMovie(MRJob):
    """Two-step job that reports the movie with the most rating lines.

    Input lines are tab-separated; the second column is used as the movie
    ID.  Step one counts lines per movie ID and attaches the movie's name,
    step two keeps the single largest ``[count, name]`` pair.
    """

    def configure_options(self):
        """Register the ``--items`` file option on top of the defaults.

        NOTE(review): newer mrjob releases appear to rename this hook to
        ``configure_args`` / ``add_file_arg`` -- confirm against the mrjob
        version in use before upgrading.
        """
        super(MostPopularMovie, self).configure_options()
        self.add_file_option('--items', help='Path to u.item')

    def steps(self):
        """Return the two MRSteps: count per movie, then pick the maximum."""
        return [
            MRStep(mapper=self.mapper_count,
                   reducer_init=self.reducer_init,
                   reducer=self.reducer_count),
            MRStep(reducer=self.reducer_most),
        ]

    def mapper_count(self, _, line):
        """Yield ``(movie_id, 1)`` for every well-formed input line.

        Lines with fewer than two tab-separated columns (e.g. blank lines)
        are skipped instead of raising ``IndexError``.
        """
        fields = line.split("\t")
        if len(fields) > 1:
            yield fields[1], 1

    def reducer_init(self):
        """Load the movie-ID -> movie-name table before reducing.

        The table is read from the file passed via ``--items``; when the
        option was not given, the historical default ``"u.item"`` in the
        working directory is used.  Each line is ``|``-separated with the ID
        in the first column and the name in the second.
        """
        # Local import keeps this block self-contained; io.open behaves the
        # same on Python 2 and 3, unlike the Python-2-only ``unicode()``.
        import io

        items_path = getattr(self.options, 'items', None) or "u.item"
        self.movieNames = {}
        # ASCII with errors='ignore' drops non-ASCII characters, matching
        # what ``unicode(name, errors='ignore')`` did on Python 2.
        with io.open(items_path, encoding='ascii', errors='ignore') as f:
            for line in f:
                fields = line.split('|')
                if len(fields) > 1:
                    self.movieNames[fields[0]] = fields[1]

    def reducer_count(self, movie, count_sum):
        """Yield ``("most", [total, name])`` for one movie ID.

        Every movie is emitted under the same key so that the next step
        sees all totals in a single reducer call.  An ID missing from the
        name table falls back to the ID itself rather than raising
        ``KeyError``.
        """
        name = self.movieNames.get(movie, movie)
        yield "most", [sum(count_sum), name]

    def reducer_most(self, _, freq):
        """Yield the largest ``[total, name]`` pair.

        Pairs compare by total first; ties are broken by name.
        """
        yield "most", max(freq)

# Script entry point: start the job only when this file is executed directly.
if __name__ == "__main__":
    MostPopularMovie.run()

mrjob_ex.py

from mrjob.job import MRJob

class MRTextInfo(MRJob):
    """Count phrases, words and word characters in the input text.

    A phrase is a ``.``-delimited fragment of a line that contains at least
    one word; words are whitespace-separated tokens within a phrase.
    """

    def mapper(self, _, line):
        """Yield partial counts for one input line.

        Emits ``('phrases', 1)`` per non-empty phrase, ``('words', 1)`` per
        word and ``('characters', len(word))`` per word.
        """
        for phrase in line.split('.'):
            words = phrase.split()
            if not words:
                # split('.') leaves an empty fragment after a trailing
                # period and for blank lines; those are not phrases.
                continue
            yield 'phrases', 1
            for word in words:
                yield 'words', 1
                yield 'characters', len(word)

    def reducer(self, key, counts):
        """Yield the total of all partial counts for ``key``."""
        yield key, sum(counts)


# Script entry point: start the job only when this file is executed directly.
if __name__ == "__main__":
    MRTextInfo.run()

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

Do MapReduce in Python with mrjob