k.j.lang.tech
10/3/2015 - 12:23 PM

Web Crawling (Test)

Web Crawling (Test)

import requests
from bs4 import BeautifulSoup


def trade_spider(url='http://www.reddit.com/r/pics/', title_limit=80, timeout=10):
    """Print the href and (truncated) title of each matching link on a page.

    Downloads ``url`` with a browser-like User-Agent header, parses the
    response body with BeautifulSoup's ``html.parser`` and, for every ``<a>``
    element selected by the class filter below, prints its ``href`` followed
    by at most ``title_limit`` characters of its text.

    Args:
        url: Page to fetch. Defaults to the address the script originally
            hard-coded.
        title_limit: Maximum number of title characters printed per link.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the script forever.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently printing nothing.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE(review): the class string (including its trailing space) is kept
    # exactly as originally written; whether it still matches depends on the
    # installed bs4 version and the live markup -- confirm.
    for link in soup.find_all('a', {'class': 'title may-blank '}):
        href = link.get('href')
        title = link.string
        if title is None:
            # .string is None unless the tag has exactly one string child;
            # fall back to the concatenated text so slicing cannot fail.
            title = link.get_text()
        print(href)
        print(title[:title_limit])

# Run the crawl as soon as this file is executed. There is no __main__ guard,
# so the request is also made whenever the file is imported.
trade_spider()

<?xml version="1.0" encoding="UTF-8"?>
<!-- Vagrant project settings: both the instance folder and the provider are left empty. -->
<project version="4">
  <component name="VagrantProjectSettings">
    <option name="instanceFolder" value="" />
    <option name="provider" value="" />
  </component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Module descriptor of type PYTHON_MODULE. The content root is $MODULE_DIR$.
     The "jdk" order entry names a Python 3.5.0 interpreter by an absolute
     /usr/local/Cellar path, which is specific to the machine it was created on. -->
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.5.0 (/usr/local/Cellar/python3/3.5.0/Frameworks/Python.framework/Versions/3.5/bin/python3.5)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
<!-- Dependency validation settings: SKIP_IMPORT_STATEMENTS is set to false. -->
<component name="DependencyValidationManager">
  <state>
    <option name="SKIP_IMPORT_STATEMENTS" value="false" />
  </state>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Module list for the project: a single module file, .idea/untitled.iml, under $PROJECT_DIR$. -->
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/untitled.iml" filepath="$PROJECT_DIR$/.idea/untitled.iml" />
    </modules>
  </component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Project-level VCS option/confirmation settings, followed by the project root
     manager entry. The project SDK is named by the same absolute
     /usr/local/Cellar Python 3.5.0 path, which is machine-specific. -->
<project version="4">
  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
    <OptionsSetting value="true" id="Add" />
    <OptionsSetting value="true" id="Remove" />
    <OptionsSetting value="true" id="Checkout" />
    <OptionsSetting value="true" id="Update" />
    <OptionsSetting value="true" id="Status" />
    <OptionsSetting value="true" id="Edit" />
    <ConfirmationsSetting value="0" id="Add" />
    <ConfirmationsSetting value="0" id="Remove" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5.0 (/usr/local/Cellar/python3/3.5.0/Frameworks/Python.framework/Versions/3.5/bin/python3.5)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Encoding settings: the PROJECT entry is mapped to charset UTF-8. -->
<project version="4">
  <component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false">
    <file url="PROJECT" charset="UTF-8" />
  </component>
</project>