"""
luanvuhlu
11/24/2017 - 9:06 AM

Download books
"""

from pyquery import PyQuery as pq
from lxml import etree
import urllib2

def next_chapter_list_page(chapters_list_page_pq):
	"""Return the href of the pagination link directly after the active page."""
	return (chapters_list_page_pq('#list-chapter ul.pagination li.active')
			.next()
			.find('a')
			.attr.href)
def parse_list_chapters(chapters_list_page_pq):
	"""Yield the href of every chapter link on a chapter-list page."""
	anchors = chapters_list_page_pq('#list-chapter ul.list-chapter li a')
	for anchor in anchors:
		yield anchor.get('href')
def parse_chapter_content(chapter_pq):
	"""Extract a chapter's title and body HTML from a parsed chapter page.

	Returns a dict: 'name' is the title wrapped in heading markup,
	'content' is the raw inner HTML of the chapter body.
	"""
	title = chapter_pq('.container.chapter a.chapter-title').text()
	body = chapter_pq('.container.chapter .chapter-c').html()
	return {
		'name': "<p class='chapter-name'><h2>%s</h2></p>" % title,
		'content': body,
	}
def has_next_chapters_list_page(chapters_list_page_pq):
	"""Truthy when the pagination bar contains a 'next page' arrow icon."""
	arrow = chapters_list_page_pq(
		'#list-chapter ul.pagination li a span.glyphicon-menu-right')
	return arrow
def write_file(name, content):
	"""Write text *content* to file *name*, UTF-8 encoded.

	Uses a context manager so the handle is closed even if the write
	raises, and binary mode because the payload is encoded by hand.
	"""
	with open(name, "wb") as text_file:
		text_file.write(content.encode('utf8'))
def get_book_content(chapters):
	"""Join chapter dicts ({'name', 'content'}) into one HTML document string.

	Bug fix: the closing tags were emitted as '</html></body>'; a valid
	document closes body before html.
	"""
	book_content_arr = ["<html>", "<body>"]
	for chapter in chapters:
		book_content_arr.append(chapter['name'] + chapter['content'])
	book_content_arr.extend(["</body>", "</html>"])
	return ''.join(book_content_arr)
def get_source(url):
	try:
		print url
		return urllib2.urlopen(url).read()
	except Exception as inst:
		print inst
def parse(url):
	"""Crawl every chapter-list page starting at *url*.

	Follows the pagination until no 'next' arrow is present, downloading
	each linked chapter along the way; returns the list of chapter dicts.
	"""
	chapters = []
	while url:
		page_pq = pq(get_source(url))
		for chapter_url in parse_list_chapters(page_pq):
			chapters.append(parse_chapter_content(pq(get_source(chapter_url))))
		if not has_next_chapters_list_page(page_pq):
			break
		url = next_chapter_list_page(page_pq)
	return chapters

def main():
	url = 'http://truyenfull.vn/luc-tien/'
	chapters = parse(url)
	book_content = get_book_content(chapters)
	write_file("luc tien.html", book_content)
	print "DONE"
# NOTE(review): this guard sits mid-file, so it runs the truyenfull crawler
# BEFORE the second script below redefines main()/write_file()/
# get_book_content(); the isach crawler then runs again at the bottom.
if __name__ == '__main__':
	main()
import urllib2
from bs4 import BeautifulSoup


# Chapter pages live at BASE_URL followed by a zero-padded chapter number
# (see get_url below).
BASE_URL = 'http://isach.info/mobile/story.php?story=tru_tien_2__tieu_dinh&chapter='
# First chapter index on the site. NOTE(review): currently unused --
# get_chapters_arr() hard-codes range(2, 133); consider wiring it through.
CHAPTER_START = 2


def main():
	"""Download every chapter from isach.info and save them as test.html."""
	downloaded = get_chapters_arr()
	write_file("test", get_book_content(downloaded))


def write_file(name, content):
	"""Write text *content* to "<name>.html", UTF-8 encoded.

	Uses a context manager so the handle is closed even if the write
	raises, and binary mode because the payload is encoded by hand.
	"""
	with open("%s.html" % name, "wb") as text_file:
		text_file.write(content.encode('utf8'))


def get_book_content(chapters):
	"""Concatenate chapter HTML fragments into one HTML document string.

	Bug fix: the closing tags were emitted as '</html></body>'; a valid
	document closes body before html.
	"""
	book_content_arr = ["<html>", "<body>"]
	book_content_arr.extend(chapters)
	book_content_arr.extend(["</body>", "</html>"])
	return ''.join(book_content_arr)


def get_chapters_arr(start=2, stop=133):
	"""Download chapters *start* (inclusive) to *stop* (exclusive).

	Defaults preserve the original hard-coded range(2, 133); the bounds
	are now parameters so other books/ranges can reuse the crawler.
	Returns the list of chapter HTML strings, in order.
	"""
	return [get_chapter(chapter_index) for chapter_index in range(start, stop)]


def get_chapter(chapter_index):
	"""Download chapter *chapter_index* and return its title + body as HTML.

	The title is the element with class 'ms_chapter'; the body is every
	element with class 'ms_text', pretty-printed and concatenated.
	Raises AttributeError if the page has no 'ms_chapter' element.
	Fix: removed a leftover debug `print type(chapter_name)` that
	polluted stdout on every chapter.
	"""
	url = get_url(chapter_index)
	page = urllib2.urlopen(url).read()
	soup = BeautifulSoup(page, "html.parser")
	chapter_name = soup.find(class_='ms_chapter')
	chapter_contents = ''.join(
		content.prettify() for content in soup.find_all(class_='ms_text'))
	return chapter_name.prettify() + chapter_contents


def get_url(chapter_index):
	"""Return the chapter page URL, with the index zero-padded to 4 digits."""
	return '%s%04d' % (BASE_URL, chapter_index)


# Script entry point for the isach.info crawler (note the earlier guard
# mid-file has already run by the time execution reaches this line).
if __name__ == '__main__':
	main()