stevenbeales
12/27/2018 - 5:45 PM

Tokenize Yelp

# -*- coding: utf-8 -*-
"""Script to tokenize yelp reviews with spaCy.

Input
=====
We use the review file in the yelp academic dataset
for this gist. See https://www.yelp.com/dataset_challenge.

``` python

    {
        u'business_id': u'2aFiy99vNLklCx3T_tGS9A',
        u'cool': 0,
        u'date': u'2011-10-10',
        u'funny': 0,
        u'review_id': u'NxL8SIC5yqOdnlXCg18IBg',
        u'stars': 5,
        u'text': u"If you enjoy service by someone who is as...",
        u'type': u'review',
        u'useful': 0,
        u'user_id': u'KpkOkG6RIf4Ra25Lhhxf1A',
    }
```
Output
======
A pandas DataFrame where each row corresponds to a single
sentence of a review.
                   review_id  sent_num                                                                      tokens
32621  B8xpcb3VRV8BtJ8YM17_HQ         3  [Nobody, pushed, me, to, get, gel, color, ,, and, I, 've, found, some, ...
20817  RKTKuMOxsimvWRfI9p-J0g         0  [This, is, a, default, corporate, watering, hole, for, the, nearby, off...
"""
from __future__ import division
from __future__ import unicode_literals

import simplejson as json
import codecs

import pandas as pd # optional
import tqdm # optional
import spacy

from itertools import izip
from pprint import pprint


# Path to the reviews file of yelp's academic dataset
# (the review JSON-lines file from https://www.yelp.com/dataset).
# NOTE: placeholder — replace with a real path before running.
REVIEWS_FILE = 'path/to/file'


def load_data(path, test_frac=0.3, n_reviews=None):
    """Tokenize reviews and return a DataFrame of sentence tokens.

    :param path: Path to the reviews file (JSON lines).
    :type path: str
    :param test_frac: Currently unused; kept for interface
        compatibility with existing callers.
    :type test_frac: float
    :param n_reviews: Maximum number of reviews to read. If None
        we read all reviews.
    :type n_reviews: int or NoneType

    :returns: One row per sentence with columns
        ``review_id``, ``sent_num`` and ``tokens``.
    :rtype: pandas.DataFrame
    """
    nlp = spacy.load('en')
    # We only want to use the parser for this task. This
    # lets us skip POS-tagging and NER detection.
    nlp.pipeline = [nlp.parser]
    return pd.DataFrame(
        iter_file_sent_tokens(
            # BUG FIX: was `path=paths.reviews` — `paths` is an
            # undefined name (NameError); use the `path` argument.
            path=path,
            nlp=nlp,
            n_reviews=n_reviews,
        ),
    )


def iter_file_sent_tokens(path, nlp, n_reviews=None):
    """Iterate over the review file and yield sentence tokens.

    :param path: Path to the reviews file.
    :type path: str
    :param nlp: Spacy tokenizer with parsing.
    :type nlp: spacy.en.English
    :param n_reviews: Maximum number of reviews to iter over. If
        None we iter over all reviews.
    :type n_reviews: int or NoneType

    :returns: Iterator over a dict. See sample output.
    :rtype: iter(dict)

    Sample output::

        {'review_id': 'iamid', 'sent_num': 0, 'tokens': ['Every', 'villain', 'is', 'lemons']}
    """
    # Py2/Py3 compat fix: `itertools.izip` was removed in Python 3
    # (where the builtin zip is already lazy) and `unicode` became
    # `str`. The original Py2-only names crash under Python 3.
    try:
        from itertools import izip as lazy_zip
    except ImportError:
        lazy_zip = zip
    try:
        text_type = unicode
    except NameError:
        text_type = str
    # The two generators read the file independently but in lockstep,
    # so the i-th review dict lines up with the i-th parsed doc.
    for review_dict, review_doc in lazy_zip(
        iter_review_dict(path=path, n_reviews=n_reviews),
        # nlp.pipe streams texts through spaCy in batches, which is
        # much faster than calling nlp() once per review.
        nlp.pipe(
            iter_review_text(path=path, n_reviews=n_reviews),
            batch_size=2500,
            n_threads=4,
        ),
    ):
        for sent_num, sent in enumerate(review_doc.sents):
            yield {
                'sent_num': sent_num,
                # List comprehension keeps the Python 2 behavior
                # where map() returned a list.
                'tokens': [text_type(token) for token in sent],
                'review_id': review_dict['review_id'],
            }


def iter_review_dict(path, n_reviews=None):
    """Yield each line of the review file parsed as a dict.

    Stops early after *n_reviews* lines when a truthy limit is
    given; otherwise iterates the whole file. A tqdm progress bar
    tracks the read.
    """
    with codecs.open(path, encoding='utf-8') as reviews_file:
        progress = tqdm.tqdm(enumerate(reviews_file), total=n_reviews)
        for line_num, raw_line in progress:
            # A falsy limit (None or 0) means "no limit".
            if n_reviews and line_num == n_reviews:
                return
            yield json.loads(raw_line)


def iter_review_text(path, n_reviews=None):
    """Yield only the ``text`` field of each review in the file."""
    reviews = iter_review_dict(path=path, n_reviews=n_reviews)
    for review in reviews:
        yield review['text']
        
if __name__ == '__main__':
    # Guard the demo run so importing this module does not kick off
    # a 10k-review tokenization as a side effect.
    df = load_data(
        path=REVIEWS_FILE,
        test_frac=0.3,
        n_reviews=10000,
    )
    # BUG FIX: the Py2-only `print df.head(2)` statement is a
    # SyntaxError on Python 3; single-argument print(...) behaves
    # identically on both 2 and 3.
    print(df.head(2))
#                    review_id  sent_num                                                                      tokens
# 32621  B8xpcb3VRV8BtJ8YM17_HQ         3  [Nobody, pushed, me, to, get, gel, color, ,, and, I, 've, found, some, ...
# 20817  RKTKuMOxsimvWRfI9p-J0g         0  [This, is, a, default, corporate, watering, hole, for, the, nearby, off...