# https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition/blob/master/ipynb/Chapter%204%20-%20Mining%20Google%2B.ipynb
# -------------------------------- 1.-------------------------------
# Search for people on Google+ by name
import httplib2 # pip install httplib2
import json
import apiclient.discovery # pip install google-api-python-client
# XXX: Enter any person's name
Q = "Tim O'Reilly"
# XXX: Enter your API key from https://code.google.com/apis/console
API_KEY = 'AIzaSyCn0d6t291r2IZqqsXf-fG1DabYpwEspp4'
service = apiclient.discovery.build('plus', 'v1', http=httplib2.Http(),
                                    developerKey=API_KEY)
people_feed = service.people().search(query=Q).execute()
print(json.dumps(people_feed['items'], indent=1))
# -------------------------------- 2.-------------------------------
# Display the retrieved user data as HTML in the IPython Notebook
from IPython.core.display import HTML
html = []
for p in people_feed['items']:
    html += ['<p><img src="%s" /> %s: %s</p>' %
             (p['image']['url'], p['id'], p['displayName'])]
HTML(''.join(html))
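# Note: in a notebook the HTML object only renders if it is the last expression in
# the cell (otherwise pass it to IPython.display.display explicitly).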
# -------------------------------- 3.-------------------------------
# Fetch a specific user's public activities by user ID
import httplib2
import json
import apiclient.discovery
USER_ID = '107033731246200681024' # Tim O'Reilly
service = apiclient.discovery.build('plus', 'v1', http=httplib2.Http(),
                                    developerKey=API_KEY)
activity_feed = service.activities().list(
    userId=USER_ID,
    collection='public',
    maxResults='100'  # Max allowed per API
).execute()
print(json.dumps(activity_feed, indent=1))
# -------------------------------- 4.-------------------------------
# Fetch the content of a single activity and clean it up
from bs4 import BeautifulSoup
# removes tags and converts HTML entities
def cleanHtml(html):
    if html == "": return ""
    return BeautifulSoup(html, 'lxml').get_text()
html_ = activity_feed['items'][0]['object']['content']
print(html_)
print(cleanHtml(html_))
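# Quick sanity check with a made-up snippet (not taken from the feed): get_text()
# drops the markup and decodes HTML entities.
print(cleanHtml('<b>Tim &amp; friends say &quot;hi&quot;</b>'))  # Tim & friends say "hi"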
# -------------------------------- 5.-------------------------------
# Page through the public activities until enough have been collected
import os
MAX_RESULTS = 200
activity_feed = service.activities().list(
    userId=USER_ID,
    collection='public',
    maxResults='100'  # Max allowed results per request
)
activity_results = []
while activity_feed is not None and len(activity_results) < MAX_RESULTS:
    activities = activity_feed.execute()
    if 'items' in activities:
        for activity in activities['items']:
            if activity['object']['objectType'] == 'note' and \
                    activity['object']['content'] != '':
                # Clean up the data
                activity['title'] = cleanHtml(activity['title'])
                activity['object']['content'] = cleanHtml(activity['object']['content'])
                activity_results += [activity]
    # list_next needs the previous request and response objects; it returns None
    # when there are no more pages, which ends the loop
    activity_feed = service.activities().list_next(activity_feed, activities)
# Write the results out to a file
if not os.path.exists('google-plus'):
    os.mkdir('google-plus')
f = open(os.path.join('google-plus', USER_ID + '.json'), 'w')
f.write(json.dumps(activity_results, indent=1))
f.close()
print(str(len(activity_results)), "activities written to", f.name)
# -------------------------------- 6.-------------------------------
# Basic language analysis with NLTK
import nltk
# Download the NLTK stopwords corpus
nltk.download('stopwords')
# Combine the content of all activities
all_content = " ".join([ a['object']['content'] for a in activity_results ])
print(len(all_content))
# Tokenize the text and create an nltk.Text object
tokens = all_content.split()
text = nltk.Text(tokens)
# Show 10 concordance lines of width 79 for the word "open"
text.concordance("open", width=79, lines=10)
# Show frequent collocations in the text
text.collocations(num=20, window_size=2)
# Frequency analysis of the vocabulary
fdist = text.vocab()
fdist["open"]
fdist["source"]
fdist["web"]
fdist["2.0"]
# Total number of tokens in the text
len(tokens)
# Number of distinct tokens
len(fdist.keys())
# Frequent words in the text that are not stopwords
[w for w, f in fdist.most_common(100)
 if w.lower() not in nltk.corpus.stopwords.words('english')]
# Long words that are not URLs
[w for w in fdist.keys() if len(w) > 15 and not w.startswith("http")]
# Number of URLs
len([w for w in fdist.keys() if w.startswith("http")])
# Word frequencies in rank order
for rank, (word, freq) in enumerate(fdist.most_common()):
    print(rank, word, freq)
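# A small derived metric, added here for illustration (not one of the original
# steps): lexical diversity, the ratio of distinct tokens to total tokens, gives a
# rough sense of how varied the vocabulary is.
print(len(fdist.keys()) / len(tokens))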
# -------------------------------- 7.-------------------------------
# Find the activities on Google+ most relevant to a set of query terms
DATA = 'google-plus/107033731246200681024.json'
data = json.loads(open(DATA).read())
# Enter your query terms
QUERY_TERMS = ['elon', 'happy']
# Build the list of documents (each document already tokenized)
activities = [activity['object']['content'].lower().split()
              for activity in data
              if activity['object']['content'] != ""]
# TextCollection provides a tf_idf wrapper
tc = nltk.TextCollection(activities)
relevant_activities = []
# Iterate over every document
for idx in range(len(activities)):
    score = 0
    # Accumulate the TF-IDF score over every query term
    for term in [t.lower() for t in QUERY_TERMS]:
        score += tc.tf_idf(term, activities[idx])
    if score > 0:
        relevant_activities.append({'score': score, 'title': data[idx]['title'],
                                    'url': data[idx]['url']})
# Sort by score and display
relevant_activities = sorted(relevant_activities,
                             key=lambda p: p['score'], reverse=True)
for activity in relevant_activities:
    print(activity['title'])
    print('\tLink: %s' % activity['url'])
    print('\tScore: %s \n' % activity['score'])
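# For reference, a minimal re-implementation of the score summed above, assuming
# nltk's definitions (tf is the term's relative frequency in the document, idf is
# the natural log of the number of documents divided by the number of documents
# containing the term). The name my_tf_idf is ours, not part of nltk.
import math

def my_tf_idf(term, doc, docs):
    tf = doc.count(term) / len(doc)
    matches = len([d for d in docs if term in d])
    idf = math.log(len(docs) / matches) if matches else 0.0
    return tf * idf

# Should track tc.tf_idf('open', activities[0]) closely
print(my_tf_idf('open', activities[0], activities))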
# -------------------------------- 8.-------------------------------
# Compare every pair of activities for similarity
# Only consider posts with more than 1,000 characters of content
data = [post for post in json.loads(open(DATA).read())
        if len(post['object']['content']) > 1000]
all_posts = [post['object']['content'].lower().split()
             for post in data]
# As above, TextCollection provides the tf_idf interface
tc = nltk.TextCollection(all_posts)
# Compute a term-document matrix such that td_matrix[(doc_title, url)][term]
# returns the TF-IDF score for term in that document
td_matrix = {}
for idx in range(len(all_posts)):
    post = all_posts[idx]
    fdist = nltk.FreqDist(post)
    doc_title = data[idx]['title']
    url = data[idx]['url']
    td_matrix[(doc_title, url)] = {}
    for term in fdist.keys():
        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, post)
# Build vectors so that each term's score sits at the same position in both
distances = {}
for (title1, url1) in td_matrix.keys():
    distances[(title1, url1)] = {}
    (min_dist, most_similar) = (1.0, ('', ''))
    for (title2, url2) in td_matrix.keys():
        if url1 == url2:
            # Skip comparing a document with itself
            continue
        # Work on copies so the original data is not mutated; later iterations
        # still need it
        terms1 = td_matrix[(title1, url1)].copy()
        terms2 = td_matrix[(title2, url2)].copy()
        # Fill in the gaps in each term map so both share the same keys (and length)
        for term1 in terms1:
            if term1 not in terms2:
                terms2[term1] = 0
        for term2 in terms2:
            if term2 not in terms1:
                terms1[term2] = 0
        # Create vectors from the term maps
        v1 = [score for (term, score) in sorted(terms1.items())]
        v2 = [score for (term, score) in sorted(terms2.items())]
        # Compute the similarity between the two documents
        distances[(title1, url1)][(title2, url2)] = \
            nltk.cluster.util.cosine_distance(v1, v2)
        if distances[(title1, url1)][(title2, url2)] < min_dist:
            (min_dist, most_similar) = (distances[(title1, url1)][(title2, url2)],
                                        (title2, url2))
    print('''Most similar to %s (%s)
-->>\t%s (%s)
-->>\tscore %f
----------------
''' % (title1.replace('\n', ''), url1,
       most_similar[0].replace('\n', ''), most_similar[1], 1 - min_dist))
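# For reference, nltk.cluster.util.cosine_distance(u, v) is 1 minus the cosine of
# the angle between the two vectors; a plain-Python equivalent (our own helper,
# shown only to make the metric explicit) looks roughly like this:
import math

def cosine_distance_sketch(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return 1.0 - dot / (norm_u * norm_v)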
# -------------------------------- 9.-------------------------------
# Visualize the pairwise similarity between activities; same computation as step 8
# Only consider posts with more than 1,000 characters of content
data = [post for post in json.loads(open(DATA).read())
        if len(post['object']['content']) > 1000]
all_posts = [post['object']['content'].lower().split()
             for post in data]
# As above, TextCollection provides the tf_idf interface
tc = nltk.TextCollection(all_posts)
# Compute a term-document matrix such that td_matrix[(doc_title, url)][term]
# returns the TF-IDF score for term in that document
td_matrix = {}
for idx in range(len(all_posts)):
    post = all_posts[idx]
    fdist = nltk.FreqDist(post)
    doc_title = data[idx]['title']
    url = data[idx]['url']
    td_matrix[(doc_title, url)] = {}
    for term in fdist.keys():
        td_matrix[(doc_title, url)][term] = tc.tf_idf(term, post)
# Build the node list and an index mapping each title to its node position
viz_links = []
viz_nodes = [{'title': title, 'url': url, 'idx': idx}
             for idx, (title, url) in enumerate(td_matrix.keys())]
idx = {vn['title']: vn['idx'] for vn in viz_nodes}
# Build vectors so that each term's score sits at the same position in both
distances = {}
for (title1, url1) in td_matrix.keys():
    distances[(title1, url1)] = {}
    (min_dist, most_similar) = (1.0, ('', ''))
    for (title2, url2) in td_matrix.keys():
        if url1 == url2:
            # Skip comparing a document with itself
            continue
        # Work on copies so the original data is not mutated; later iterations
        # still need it
        terms1 = td_matrix[(title1, url1)].copy()
        terms2 = td_matrix[(title2, url2)].copy()
        # Fill in the gaps in each term map so both share the same keys (and length)
        for term1 in terms1:
            if term1 not in terms2:
                terms2[term1] = 0
        for term2 in terms2:
            if term2 not in terms1:
                terms1[term2] = 0
        # Create vectors from the term maps
        v1 = [score for (term, score) in sorted(terms1.items())]
        v2 = [score for (term, score) in sorted(terms2.items())]
        # Compute the similarity between the two documents
        distances[(title1, url1)][(title2, url2)] = \
            nltk.cluster.util.cosine_distance(v1, v2)
        if distances[(title1, url1)][(title2, url2)] < min_dist:
            (min_dist, most_similar) = (distances[(title1, url1)][(title2, url2)],
                                        (title2, url2))
    # Record only the single most similar document for each source document
    viz_links.append({'source': idx[title1], 'target': idx[most_similar[0]],
                      'score': 1 - min_dist})
f = open('google-plus/matrix.json', 'w')
f.write(json.dumps({'nodes' : viz_nodes, 'links' : viz_links}, indent=1))
f.close()
# html:
# curl -O https://raw.githubusercontent.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition/master/ipynb/resources/ch04-googleplus/viz/matrix.html
# css:
# curl -O https://raw.githubusercontent.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition/master/ipynb/resources/ch04-googleplus/viz/style.css
# the original visualization this matrix layout is based on:
# http://bost.ocks.org/mike/miserables/
# python -m http.server 9000
# http://localhost:9000/matrix.html
# -------------------------------- 10.-------------------------------
# On collocations: for n tokens there are n-1 adjacent pairs (bigrams)
# ["Mr.", "Green", "killed", "Colonel", "Mustard"]
# ->[("Mr.", "Green"), ("Green", "killed"), ("killed", "Colonel"), ("Colonel", "Mustard")]
import nltk
sentence = "Mr. Green killed Colonel Mustard in the study with the " + \
"candlestick. Mr. Green is not a very nice fellow."
print(list(nltk.ngrams(sentence.split(), 2)))
txt = nltk.Text(sentence.split())
txt.collocations()
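# Going from the raw bigram list to counts (the first step any collocation scorer
# needs) only takes a FreqDist over the bigrams:
bigram_counts = nltk.FreqDist(nltk.ngrams(sentence.split(), 2))
print(bigram_counts.most_common(3))  # ('Mr.', 'Green') occurs twice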
# -------------------------------- 11.-------------------------------
# txt.collocations() only prints its results and returns nothing, but following
# the source in nltk/text.py it is easy to reproduce
N = 25  # Number of collocations to find
all_tokens = [token for activity in data
              for token in activity['object']['content'].lower().split()]
finder = nltk.BigramCollocationFinder.from_words(all_tokens)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: w in nltk.corpus.stopwords.words('english'))
scorer = nltk.BigramAssocMeasures.jaccard
collocations = finder.nbest(scorer, N)
for collocation in collocations:
    c = ' '.join(collocation)
    print(c)
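# For intuition, the Jaccard measure scores a bigram (w1, w2) as
#   count(w1 w2) / (count(w1 *) + count(* w2) - count(w1 w2)),
# the intersection of "w1 appears first" and "w2 appears second" over their union.
# A hand-rolled sketch of the idea (our own helper, not nltk's implementation):
def jaccard_sketch(bigram, tokens):
    w1, w2 = bigram
    pairs = list(nltk.ngrams(tokens, 2))
    both = pairs.count((w1, w2))
    w1_first = sum(1 for a, _ in pairs if a == w1)
    w2_second = sum(1 for _, b in pairs if b == w2)
    return both / (w1_first + w2_second - both)

print(jaccard_sketch(collocations[0], all_tokens))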