
Web Crawlers

# ontology_com
# Created by JKChang
# 11/06/2018, 12:46
# Tag:
# Description: extract list of ontologies from the OLS

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

url = 'https://www.ebi.ac.uk/ols/ontologies'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

# the first <tbody> holds the ontology table; each <tr> is one ontology
tbody = soup.find_all('tbody')
ontos = tbody[0].find_all('tr')


class Ontology(object):
    def __init__(self, name, fullname, url, description):
        self.name = name
        self.fullname = fullname
        self.url = url
        self.description = description


res = []

for onto in ontos:
    # full name: usually inside a link, otherwise in a plain <span>
    try:
        name = onto.find_all('a')[0].text
    except IndexError:
        name = onto.find_all('span')[0].text

    # abbreviation
    abb = onto.find_all('span', {"class": "ontology-source"})[0].text

    # url (hrefs on the page are relative, so join them onto the site root)
    path = 'https://www.ebi.ac.uk/'
    try:
        url = urljoin(path, onto.find_all('a')[0].attrs['href'])
    except IndexError:
        url = ''

    # description lives in the third table cell
    des = onto.find_all('td')[2].text

    a = Ontology(name=abb, fullname=name, url=url, description=des)
    res.append(a)

# read the reference list of ontology names from a local file
meta = []
with open('./Desktop/ols.csv') as f:
    for line in f:
        meta.append(line.strip())

ols = []
for onto in res:
    ols.append(onto.name)


def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))


print(sorted(intersection(ols, meta)))
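
# A minimal sketch (not part of the original script) of persisting the scraped
# ontologies to CSV as well; the filename 'ols_scraped.csv' is an assumption.
import csv

with open('ols_scraped.csv', 'w', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(['name', 'fullname', 'url', 'description'])  # header row
    for onto in res:
        writer.writerow([onto.name, onto.fullname, onto.url, onto.description])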
# Created by JKChang
# 08/08/2017, 12:26
# Tag:
# Description: films hyperlink extraction

import re
import urllib.request

url = 'http://videos.yizhansou.com/5004'
fp = urllib.request.urlopen(url)
mybytes = fp.read()

# the page is served in GBK (a common Chinese encoding), so decode accordingly
mystr = mybytes.decode('gbk').strip()
fp.close()

# capture everything between href="..." or href='...' quote pairs
links = re.findall(r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')", mystr)
for link in links:
    if 'ed2k' in link:  # keep only ed2k:// download links
        print(link)
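
# An alternative sketch (not in the original): the same href extraction done
# with BeautifulSoup instead of a regex, which copes better with markup quirks.
from bs4 import BeautifulSoup

soup = BeautifulSoup(mystr, 'html.parser')
for a in soup.find_all('a', href=True):
    if 'ed2k' in a['href']:
        print(a['href'])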
# Description: load a JavaScript-rendered page with Selenium + PhantomJS
#              and save its HTML source

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.PhantomJS()  # headless browser, so the page's JavaScript is executed
driver.get(r"https://bioportal.bioontology.org/ontologies")

wait = WebDriverWait(driver, 10)

# click proceed
proceed = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "Proceed")))
proceed.click()

# wait for the content to be present
wait.until(EC.presence_of_element_located((By.ID, "workskin")))

soup = BeautifulSoup(driver.page_source, "html.parser")


with open('res.txt', 'w') as f:
    f.write(str(soup))  # BeautifulSoup objects must be converted to str before writing

driver.quit()
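
# A minimal sketch (an assumption, not in the original): PhantomJS support is
# deprecated in newer Selenium releases, so the same page load can be done with
# headless Chrome instead; requires chromedriver on PATH.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless')  # render the page without opening a window
driver = webdriver.Chrome(options=opts)
driver.get('https://bioportal.bioontology.org/ontologies')
print(driver.page_source[:200])  # quick check that the page rendered
driver.quit()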