mutegdp
7/18/2017 - 5:06 PM

Scraping : Type 1 - content changes dynamically without the URL changing (uses JS)

"""
Scraping    : Tipe-1 - Kontent berubah secara otomatis tanpa merubah URL (pakai JS)
Keterangan  : Contohnya adalah ifundraise.ie
"""

import time
from selenium import webdriver
from bs4 import BeautifulSoup

# initialisation: start Firefox and prepare a list for the collected project URLs
browser = webdriver.Firefox()
#browser.get('about:config')
urls = []

def save_txt(idx, lis): # save the URLs grabbed from one page to file-<idx>.txt
    with open("file-{}.txt".format(idx), "w") as output:
        for l in lis:
            output.write('{}\n'.format(l))

def grabbing(): # crawling
    # ask for the parent (listing) URL
    url = input('Enter the parent URL: ')
    browser.get(url)

    get_links(1) # grab the project links on the home page

    def scrol(start, last):
        # go to the next pages
        for page in range(start, last):
            try:
                browser.execute_script("submit_frm('pagging',document.form1,'','{}','','');".format(page))
                get_links(page)
                time.sleep(20)
            except Exception: # if the page did not load correctly, go back to the starting page and retry this page
                browser.get(url)
                browser.execute_script("submit_frm('pagging',document.form1,'','{}','','');".format(page))
                get_links(page)
                time.sleep(20)

    while True: # once grabbing starts, start and last-1 are the pages that will be grabbed
        proses = input('Scroll? start last = ')
        if proses:
            start, last = proses.split()
            scrol(int(start), int(last))
        else:
            break

def get_links(idx): # grab the project links from the current page
    html = browser.page_source
    soup = BeautifulSoup(html, "html.parser")

    all_url = soup.select("div.col-sm-2.col-xs-12.ifr-charityLists-readmore > a")

    page_urls = [url.get('href') for url in all_url]
    urls.extend(page_urls)

    save_txt(idx, page_urls) # save this page's links, not the raw <a> tags

while True: # keep offering to grab until the user answers something other than 'y'
    grab = input('grab? y/n ')
    if grab == 'y':
        grabbing()
    else:
        break

print('Program finished')
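
The fixed time.sleep(20) calls make every page take at least 20 seconds even when the new links render quickly, and they can still be too short on a slow connection. A minimal sketch of a more robust wait, assuming the site detaches the old listing nodes when submit_frm re-renders the page (so they go stale), could use Selenium's WebDriverWait instead; the helper name goto_page and the 30-second timeout are assumptions, not part of the original script.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Same selector that get_links() uses for the project "read more" links.
READMORE = "div.col-sm-2.col-xs-12.ifr-charityLists-readmore > a"

def goto_page(driver, page, timeout=30):  # timeout is an assumed value
    """Trigger the site's JS pagination and wait for the listing to re-render."""
    old = driver.find_element(By.CSS_SELECTOR, READMORE)
    driver.execute_script(
        "submit_frm('pagging',document.form1,'','{}','','');".format(page))
    wait = WebDriverWait(driver, timeout)
    wait.until(EC.staleness_of(old))  # old node detached from the DOM
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, READMORE)))  # new links rendered

With a helper like this, scrol() could call goto_page(browser, page) followed by get_links(page) and drop the sleeps entirely.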