zr777
5/8/2017 - 1:47 PM

From https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition/blob/master/ipynb/Chapter%204%20-%20Mining%20Google%2B.ipynb


# -------------------------------- 1.-------------------------------
# Search for people on Google+
import httplib2 # pip install httplib2
import json
import apiclient.discovery # pip install google-api-python-client

# XXX: Enter any person's name
Q = "Tim O'Reilly"

# XXX: Enter your API key from https://code.google.com/apis/console
API_KEY = 'AIzaSyCn0d6t291r2IZqqsXf-fG1DabYpwEspp4' 

service = apiclient.discovery.build('plus', 'v1', http=httplib2.Http(), 
                                    developerKey=API_KEY)

people_feed = service.people().search(query=Q).execute()

print(json.dumps(people_feed['items'], indent=1))


# -------------------------------- 2.-------------------------------
# Display the retrieved people data as HTML in the IPython Notebook
from IPython.core.display import HTML

html = []

for p in people_feed['items']:
    html += ['<p><img src="%s" /> %s: %s</p>' % \
             (p['image']['url'], p['id'], p['displayName'])]

HTML(''.join(html))


# -------------------------------- 3.-------------------------------
# Fetch a specific user's public activities by user ID
import httplib2
import json
import apiclient.discovery

USER_ID = '107033731246200681024' # Tim O'Reilly

service = apiclient.discovery.build('plus', 'v1', http=httplib2.Http(), 
                                    developerKey=API_KEY)

activity_feed = service.activities().list(
  userId=USER_ID,
  collection='public',
  maxResults='100' # Max allowed per API
).execute()

print(json.dumps(activity_feed, indent=1))


# -------------------------------- 4.-------------------------------
# Fetch the content of a single activity and clean the HTML
from bs4 import BeautifulSoup

# removes tags and converts HTML entities
def cleanHtml(html):
  if html == "": return ""

  return BeautifulSoup(html, 'lxml').get_text()

html_ = activity_feed['items'][0]['object']['content']
print(html_)
print(cleanHtml(html_))
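
# Quick check of cleanHtml on a small literal snippet (hypothetical example
# string, not from the feed): tags are stripped and HTML entities decoded.
print(cleanHtml('<p>Hello &amp; welcome to <b>Google+</b></p>'))
# -> Hello & welcome to Google+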

# -------------------------------- 5.-------------------------------
# Page through the public activities until enough have been collected
import os

MAX_RESULTS = 200

activity_feed = service.activities().list(
  userId=USER_ID,
  collection='public',
  maxResults='100' # Max allowed per API request
)

activity_results = []

while activity_feed is not None and len(activity_results) < MAX_RESULTS:

  activities = activity_feed.execute()

  if 'items' in activities:

    for activity in activities['items']:

      if activity['object']['objectType'] == 'note' and \
         activity['object']['content'] != '':
        # Clean the HTML in place
        activity['title'] = cleanHtml(activity['title'])
        activity['object']['content'] = cleanHtml(activity['object']['content'])
        activity_results += [activity]

  # list_next() requires the previous request and response objects
  activity_feed = service.activities().list_next(activity_feed, activities)

  
# Write the results to a file
if not os.path.exists('google-plus'):
    os.mkdir('google-plus')
f = open(os.path.join('google-plus', USER_ID + '.json'), 'w')
f.write(json.dumps(activity_results, indent=1))
f.close()
print(str(len(activity_results)), "activities written to", f.name)


# -------------------------------- 6.-------------------------------
# Basic language analysis with NLTK
import nltk

# Download the NLTK stopwords corpus
nltk.download('stopwords')

# Combine all activity content into a single string
all_content = " ".join([ a['object']['content'] for a in activity_results ])
print(len(all_content))

# Tokenize on whitespace and build an nltk.Text object
tokens = all_content.split()
text = nltk.Text(tokens)

# Show 10 concordance lines of width 79 for the word "open"
text.concordance("open", width=79, lines=10)

# Show frequent collocations in the text
text.collocations(num=20, window_size=2)

# Word frequency analysis
fdist = text.vocab()
fdist["open"]
fdist["source"]
fdist["web"]
fdist["2.0"]

# Total number of tokens in the text
len(tokens)

# Number of unique tokens
len(fdist.keys())
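
# The ratio of the two counts above (unique tokens / total tokens) gives a
# quick lexical-diversity measure for the combined content:
print(len(fdist.keys()) / len(tokens))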

# Common words in the text that are not stopwords
[w for w,f in fdist.most_common(100) \
   if w.lower() not in nltk.corpus.stopwords.words('english')]

# Long words that are not URLs
[w for w in fdist.keys() if len(w) > 15 and not w.startswith("http")]

# Number of URLs
len([w for w in fdist.keys() if w.startswith("http")])

# Frequency distribution, ranked by count
for rank, (word, count) in enumerate(fdist.most_common()):
    print(rank, word, count)

# -------------------------------- 7.-------------------------------
# Find the collected Google+ activities most relevant to a set of query terms
DATA = 'google-plus/107033731246200681024.json'
data = json.loads(open(DATA).read())

# XXX: Enter your query terms
QUERY_TERMS = ['elon', 'happy']

# Build the document list (each document pre-tokenized)
activities = [activity['object']['content'].lower().split() \
              for activity in data \
                if activity['object']['content'] != ""]

# TextCollection provides a tf_idf wrapper (a minimal sketch of the underlying
# computation follows this section)
tc = nltk.TextCollection(activities)
relevant_activities = []

# Iterate over every document
for idx in range(len(activities)):
    score = 0
    # Accumulate the tf-idf score over every query term
    for term in [t.lower() for t in QUERY_TERMS]:
        score += tc.tf_idf(term, activities[idx])
    if score > 0:
        relevant_activities.append({'score': score, 'title': data[idx]['title'],
                              'url': data[idx]['url']})

# Sort by score and display
relevant_activities = sorted(relevant_activities, 
                             key=lambda p: p['score'], reverse=True)
for activity in relevant_activities:
    print(activity['title'])
    print('\tLink: %s' % activity['url'])
    print('\tScore: %s \n' % activity['score'])
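
# For reference, a minimal sketch of the scoring that nltk.TextCollection.tf_idf
# wraps, assuming the standard definitions tf = count / len(doc) and
# idf = log(N / number of docs containing the term) (0 if the term never occurs):
import math

def tf_idf_sketch(term, doc, docs):
    tf = doc.count(term) / len(doc)
    matches = len([d for d in docs if term in d])
    idf = math.log(len(docs) / matches) if matches else 0.0
    return tf * idf

# e.g. tf_idf_sketch('elon', activities[0], activities) should agree with
# tc.tf_idf('elon', activities[0])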

	
# -------------------------------- 8.-------------------------------
# Compare every pair of activities for similarity
# Only consider posts whose content is longer than 1000 characters
data = [ post for post in json.loads(open(DATA).read())
         if len(post['object']['content']) > 1000 ]

all_posts = [post['object']['content'].lower().split() 
             for post in data ]

# As above, TextCollection provides the tf_idf interface
tc = nltk.TextCollection(all_posts)

# Compute a term-document matrix such that td_matrix[(doc_title, url)][term]
# returns a tf-idf score for term in that document
td_matrix = {}
for idx in range(len(all_posts)):
    post = all_posts[idx]
    fdist = nltk.FreqDist(post)

    doc_title = data[idx]['title']
    url = data[idx]['url']
    td_matrix[(doc_title, url)] = {}

    for term in fdist.keys():
        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, post)

# Build vectors such that term scores are in the same positions
distances = {}
for (title1, url1) in td_matrix.keys():

    distances[(title1, url1)] = {}
    (min_dist, most_similar) = (1.0, ('', ''))

    for (title2, url2) in td_matrix.keys():
        
        if url1 == url2:
            # Skip comparing a document with itself
            continue

        # Take care not to mutate the original data structures, since they are
        # needed again in later iterations
        terms1 = td_matrix[(title1, url1)].copy()
        terms2 = td_matrix[(title2, url2)].copy()

        # Fill in missing terms in each map so both have the same keys (and vector length)
        for term1 in terms1:
            if term1 not in terms2:
                terms2[term1] = 0
        for term2 in terms2:
            if term2 not in terms1:
                terms1[term2] = 0

        # Create vectors from the term maps
        v1 = [score for (term, score) in sorted(terms1.items())]
        v2 = [score for (term, score) in sorted(terms2.items())]

        # Compute the cosine distance between the two documents (a short sketch
        # of the distance follows this section)
        distances[(title1, url1)][(title2, url2)] = \
            nltk.cluster.util.cosine_distance(v1, v2)

        if distances[(title1, url1)][(title2, url2)] < min_dist:
            (min_dist, most_similar) = (distances[(title1, url1)][(title2,
                                         url2)], (title2, url2))
    
    print('''Most similar to %s (%s)
-->>\t%s (%s)
-->>\tscore %f
----------------
''' % (title1.replace('\n', ''), url1,
            most_similar[0].replace('\n', ''), most_similar[1], 1-min_dist))
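
# For reference, a minimal sketch of what nltk.cluster.util.cosine_distance
# computes, assuming the usual definition distance = 1 - cosine similarity:
def cosine_distance_sketch(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = sum(a * a for a in u) ** 0.5
    norm_v = sum(b * b for b in v) ** 0.5
    return 1.0 - dot / (norm_u * norm_v)

# e.g. cosine_distance_sketch(v1, v2) should agree with
# nltk.cluster.util.cosine_distance(v1, v2)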


# -------------------------------- 9.-------------------------------
# Visualize pairwise similarity between activities (same computation as section 8)
# Only consider posts whose content is longer than 1000 characters
data = [ post for post in json.loads(open(DATA).read())
         if len(post['object']['content']) > 1000 ]

all_posts = [post['object']['content'].lower().split() 
             for post in data ]

# As above, TextCollection provides the tf_idf interface
tc = nltk.TextCollection(all_posts)

# Compute a term-document matrix such that td_matrix[(doc_title, url)][term]
# returns a tf-idf score for term in that document
td_matrix = {}
for idx in range(len(all_posts)):
    post = all_posts[idx]
    fdist = nltk.FreqDist(post)

    doc_title = data[idx]['title']
    url = data[idx]['url']
    td_matrix[(doc_title, url)] = {}

    for term in fdist.keys():
        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, post)
        
# Build the visualization nodes and a title -> index lookup
viz_links = []
viz_nodes = [ {'title': title, 'url': url, 'idx': idx} for idx, (title, url) in enumerate(td_matrix.keys()) ]
idx = {vn['title']: vn['idx'] for vn in viz_nodes }

# Build vectors such that term scores are in the same positions
distances = {}
for (title1, url1) in td_matrix.keys():

    distances[(title1, url1)] = {}
    (min_dist, most_similar) = (1.0, ('', ''))

    for (title2, url2) in td_matrix.keys():
        
        if url1 == url2:
            # Skip comparing a document with itself
            continue

        # Take care not to mutate the original data structures, since they are
        # needed again in later iterations
        terms1 = td_matrix[(title1, url1)].copy()
        terms2 = td_matrix[(title2, url2)].copy()

        # Fill in missing terms in each map so both have the same keys (and vector length)
        for term1 in terms1:
            if term1 not in terms2:
                terms2[term1] = 0
        for term2 in terms2:
            if term2 not in terms1:
                terms1[term2] = 0

        # Create vectors from the term maps
        v1 = [score for (term, score) in sorted(terms1.items())]
        v2 = [score for (term, score) in sorted(terms2.items())]

        # Compute the cosine distance between the two documents
        distances[(title1, url1)][(title2, url2)] = \
            nltk.cluster.util.cosine_distance(v1, v2)

        if distances[(title1, url1)][(title2, url2)] < min_dist:
            (min_dist, most_similar) = (distances[(title1, url1)][(title2,
                                         url2)], (title2, url2))
    # Record only the link from this document to its most similar document
    viz_links.append({'source' : idx[title1], 'target' : idx[most_similar[0]], 'score' : 1 - min_dist})
        
f = open('google-plus/matrix.json', 'w')
f.write(json.dumps({'nodes' : viz_nodes, 'links' : viz_links}, indent=1))
f.close()

# html:
# curl -O https://raw.githubusercontent.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition/master/ipynb/resources/ch04-googleplus/viz/matrix.html
# css:
# curl -O https://raw.githubusercontent.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition/master/ipynb/resources/ch04-googleplus/viz/style.css
# original visualization this is based on:
# http://bost.ocks.org/mike/miserables/
# python -m http.server 9000
# http://localhost:9000/matrix.html


# -------------------------------- 10.-------------------------------
# On collocations: with bigrams, n tokens yield n-1 adjacent pairs, e.g.
# ["Mr.", "Green", "killed", "Colonel", "Mustard"]
# -> [("Mr.", "Green"), ("Green", "killed"), ("killed", "Colonel"), ("Colonel", "Mustard")]
# (a plain zip-based sketch follows this section)
import nltk

sentence = "Mr. Green killed Colonel Mustard in the study with the " + \
           "candlestick. Mr. Green is not a very nice fellow."

print(list(nltk.ngrams(sentence.split(), 2)))
txt = nltk.Text(sentence.split())

txt.collocations()
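
# The n-1 pairing described above can also be reproduced without NLTK by
# zipping the token list against itself shifted by one:
tokens_ = sentence.split()
print(list(zip(tokens_, tokens_[1:])))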

# -------------------------------- 11.-------------------------------
# txt.collocations() only prints its results and returns nothing; based on the
# NLTK source (nltk/text.py) it is easy to reproduce the computation and keep the results

N = 25  # Number of collocations to find

all_tokens = [token for activity in data
              for token in activity['object']['content'].lower().split()]

finder = nltk.BigramCollocationFinder.from_words(all_tokens)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: w in nltk.corpus.stopwords.words('english'))
scorer = nltk.BigramAssocMeasures.jaccard
collocations = finder.nbest(scorer, N)

for collocation in collocations:
    c = ' '.join(collocation)
    print(c)
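
# For reference, a minimal sketch of the Jaccard score used above, assuming the
# standard contingency counts for a bigram (w1, w2): n_ii = count of the bigram,
# n_io = w1 occurring without w2, n_oi = w2 occurring without w1.
def jaccard_sketch(n_ii, n_io, n_oi):
    # Fraction of occurrences of either word in which the two appear together
    return n_ii / (n_ii + n_io + n_oi)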