# kymbrik
# 9/25/2019 - 9:55 AM
#
# ChromeExtractor Selenium

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class ChromeExtractor:
    """Fetch rendered pages through a headless Chrome driven by Selenium.

    Every public call starts its own browser session and shuts it down
    before returning, whether the call succeeds or raises.
    """

    # Location of the chromedriver binary handed to ``webdriver.Chrome``.
    CHROMEDRIVER_PATH = '/usr/bin/chromedriver'

    # Command-line switches applied to every browser that is launched.
    CHROME_ARGUMENTS = ('--headless', '--no-sandbox', '--disable-dev-shm-usage')

    # CSS selector used by ``get_page_links`` when the caller supplies none.
    DEFAULT_LINK_SELECTOR = ".list-post-body .entry-title a"

    @staticmethod
    def _new_driver():
        """Start a Chrome session configured with ``CHROME_ARGUMENTS``."""
        chrome_options = Options()
        for argument in ChromeExtractor.CHROME_ARGUMENTS:
            chrome_options.add_argument(argument)
        # NOTE(review): the positional driver path and the ``chrome_options``
        # keyword are kept exactly as before; newer Selenium releases may
        # expect ``options=`` / a Service object -- confirm against the
        # installed version before changing.
        return webdriver.Chrome(
            ChromeExtractor.CHROMEDRIVER_PATH,
            chrome_options=chrome_options,
        )

    @staticmethod
    def get_page_content(url: str) -> str:
        """Load ``url`` in headless Chrome and return the page source.

        Args:
            url: Address of the page to load.

        Returns:
            The value of the driver's ``page_source`` after the page loaded.

        Raises:
            Whatever Selenium raises while starting the browser or loading
            the page; the browser is shut down before the error propagates.
        """
        driver = ChromeExtractor._new_driver()
        try:
            driver.get(url)
            return driver.page_source
        finally:
            # quit() ends the whole session (browser and driver process);
            # close() would only close the current window, and without the
            # ``finally`` a failed page load would leak the browser.
            driver.quit()

    @staticmethod
    def get_page_links(url: str, selector: str = DEFAULT_LINK_SELECTOR) -> list:
        """Load ``url`` and return the ``href`` of every matching element.

        Args:
            url: Address of the page to load.
            selector: CSS selector of the elements whose ``href`` is read.
                Defaults to ``DEFAULT_LINK_SELECTOR``.

        Returns:
            A list with one ``href`` attribute value per matched element, in
            the order the driver returned the elements. Empty when nothing
            matches.

        Raises:
            Whatever Selenium raises while starting the browser, loading the
            page or querying it; the browser is shut down before the error
            propagates.
        """
        driver = ChromeExtractor._new_driver()
        try:
            driver.get(url)
            anchors = driver.find_elements_by_css_selector(selector)
            # Attributes must be read while the session is still open, so
            # the list is built before ``finally`` tears the browser down.
            return [anchor.get_attribute('href') for anchor in anchors]
        finally:
            driver.quit()