Download books
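
Two small Python 2 scrapers that stitch a web novel's chapters into a single HTML file. The first crawls the paginated chapter list on truyenfull.vn with PyQuery; the second fetches numbered chapter pages from isach.info with BeautifulSoup.
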
from pyquery import PyQuery as pq
import urllib2

def next_chapter_list_page(chapters_list_page_pq):
    # Return the href of the pagination item that follows the active one.
    current_li = chapters_list_page_pq('#list-chapter ul.pagination li.active')
    next_li = current_li.next()
    a = next_li.find('a')
    return a.attr.href

def parse_list_chapters(chapters_list_page_pq):
    # Yield the URL of every chapter linked from the current list page.
    for a in chapters_list_page_pq('#list-chapter ul.list-chapter li a'):
        yield a.get('href')

def parse_chapter_content(chapter_pq):
    name_text = chapter_pq('.container.chapter a.chapter-title').text()
    # An <h2> cannot legally nest inside <p>, so emit a bare heading.
    name_tag = "<h2 class='chapter-name'>%s</h2>" % name_text
    content = chapter_pq('.container.chapter .chapter-c').html()
    return {'name': name_tag, 'content': content}

def has_next_chapters_list_page(chapters_list_page_pq):
    # The right-arrow icon only appears when another list page exists;
    # an empty PyQuery selection is falsy, so this doubles as a boolean.
    return chapters_list_page_pq('#list-chapter ul.pagination li a span.glyphicon-menu-right')

def write_file(name, content):
    with open(name, "w") as text_file:
        text_file.write(content.encode('utf8'))

def get_book_content(chapters):
    book_content_arr = ["<html>", "<body>"]
    for chapter in chapters:
        book_content_arr.append(chapter['name'] + chapter['content'])
    # Close the tags in the reverse of the order they were opened.
    book_content_arr.extend(["</body>", "</html>"])
    return ''.join(book_content_arr)

def get_source(url):
    # Fetch a URL; returns None if the request fails.
    try:
        print url
        return urllib2.urlopen(url).read()
    except Exception as inst:
        print inst

def parse(url):
    chapters = []
    while url:
        source = get_source(url)
        if not source:
            break
        source_pq = pq(source)
        for chapter_url in parse_list_chapters(source_pq):
            chapter_source = get_source(chapter_url)
            if not chapter_source:
                continue
            chapter_pq = pq(chapter_source)
            chapter = parse_chapter_content(chapter_pq)
            chapters.append(chapter)
        if has_next_chapters_list_page(source_pq):
            url = next_chapter_list_page(source_pq)
        else:
            break
    return chapters

def main():
    url = 'http://truyenfull.vn/luc-tien/'
    chapters = parse(url)
    book_content = get_book_content(chapters)
    write_file("luc tien.html", book_content)
    print "DONE"

if __name__ == '__main__':
    main()
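
The second script takes a simpler route: isach.info serves chapters at sequential, zero-padded URLs, so there is no pagination to follow and the script just walks a fixed index range with BeautifulSoup.
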
import urllib2
from bs4 import BeautifulSoup

BASE_URL = 'http://isach.info/mobile/story.php?story=tru_tien_2__tieu_dinh&chapter='
CHAPTER_START = 2
CHAPTER_END = 133

def main():
    chapters = get_chapters_arr()
    book_content = get_book_content(chapters)
    write_file("test", book_content)

def write_file(name, content):
    with open("%s.html" % name, "w") as text_file:
        text_file.write(content.encode('utf8'))

def get_book_content(chapters):
    book_content_arr = ["<html>", "<body>"]
    for chapter in chapters:
        book_content_arr.append(chapter)
    # Close the tags in the reverse of the order they were opened.
    book_content_arr.extend(["</body>", "</html>"])
    return ''.join(book_content_arr)

def get_chapters_arr():
    chapters = []
    for chapter_index in range(CHAPTER_START, CHAPTER_END):
        chapters.append(get_chapter(chapter_index))
    return chapters

def get_chapter(chapter_index):
    url = get_url(chapter_index)
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page, "html.parser")
    chapter_name = soup.find(class_='ms_chapter')
    chapter_contents = ''.join([content.prettify() for content in soup.find_all(class_='ms_text')])
    return chapter_name.prettify() + chapter_contents

def get_url(chapter_index):
    # Chapter numbers are zero-padded to four digits, e.g. 2 -> '0002'.
    return BASE_URL + '%04d' % chapter_index

if __name__ == '__main__':
    main()
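
Both scripts target Python 2 (urllib2 and print statements). For reference, here is a minimal sketch of what the fetch helper might look like under Python 3 with urllib.request, assuming UTF-8 pages and no retry or timeout handling:

import urllib.request

def get_source(url):
    # Fetch a URL and return its body as text; None on failure.
    try:
        print(url)
        # urlopen().read() returns bytes; the decode assumes UTF-8 pages.
        return urllib.request.urlopen(url).read().decode('utf-8')
    except Exception as inst:
        print(inst)
        return None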