# Scraping : Tipe-1 - Kontent berubah secara otomatis tanpa merubah URL (pakai JS)
"""
Scraping : Tipe-1 - Kontent berubah secara otomatis tanpa merubah URL (pakai JS)
Keterangan : Contohnya adalah ifundraise.ie
"""
import time
from selenium import webdriver
from bs4 import BeautifulSoup
# Initialisation: launch a Firefox window driven by Selenium.
browser = webdriver.Firefox()
#browser.get('about:config')
urls = []  # accumulates every grabbed project href across all pages
def save_txt(idx, lis):
    """Write each item of *lis* to file-<idx>.txt, one per line."""
    with open("file-{}.txt".format(idx), "w") as output:
        output.writelines('{}\n'.format(item) for item in lis)
def grabbing():
    """Crawl: ask for the index URL, grab its links, then page interactively.

    The site changes content via JS pagination without changing the URL,
    so paging is done by executing the site's own ``submit_frm`` script.
    """
    # get the index (parent) url from the user
    url = input('Masukkan url induk: ')
    browser.get(url)
    get_links(1)  # grab the home-page projects link

    def scrol(start, last):
        # Walk pages [start, last) via the site's JS pagination.
        for page in range(start, last):
            try:
                browser.execute_script("submit_frm('pagging',document.form1,'','{}','','');".format(page))
                get_links(page)
                time.sleep(20)
            # FIX: was a bare ``except:`` which also swallowed
            # KeyboardInterrupt/SystemExit (blocking Ctrl-C during the
            # 20 s sleeps). If the page failed to load, go back to the
            # index page and retry this page once.
            except Exception:
                browser.get(url)
                browser.execute_script("submit_frm('pagging',document.form1,'','{}','','');".format(page))
                get_links(page)
                time.sleep(20)

    # Keep asking for page ranges; pages start..last-1 are grabbed.
    # An empty answer ends the session.
    while True:
        proses = input('Scrol? start, last = ')
        if proses:
            start, last = proses.split()
            scrol(int(start), int(last))
        else:
            break
def get_links(idx):
    """Grab the project links from the page currently loaded in the browser.

    Appends each href to the global ``urls`` list and saves this page's
    hrefs to ``file-<idx>.txt`` via ``save_txt``.
    """
    html = browser.page_source
    soup = BeautifulSoup(html, "html.parser")
    anchors = soup.select("div.col-sm-2.col-xs-12.ifr-charityLists-readmore > a")
    hrefs = [a.get('href') for a in anchors]
    urls.extend(hrefs)
    # FIX: previously passed ``all_url`` (the raw <a> Tag objects) to
    # save_txt, so the txt file contained full anchor HTML instead of the
    # URLs the comment promises. Save the extracted hrefs instead.
    save_txt(idx, hrefs)
# Main driver: keep starting grab sessions until the user answers
# anything other than 'y'.
while True:
    grab = input('grab? y/n ')
    if grab == 'y':
        grabbing()
    else:
        break
print('Program selesai')