# Leidingfei
# 10/2/2019 - 12:56 PM

# spyder

# -*- coding: utf-8 -*-
"""
Created on Mon Oct  8 10:47:46 2018

@author: dingfei
"""

import requests
import json
from bs4 import BeautifulSoup
#from urllib.request import urlopen
#from urllib.request import Request
import re
import socket
import time
from pandas import DataFrame
# HTTP request headers copied from a browser session on www.indiegogo.com.
# A Cookie must be included in the request headers.
# NOTE(review): the Cookie value embeds a captured session (`_session_id`,
# `visitor_id`, ...); consider loading it from the environment or a local
# config file instead of keeping it in source control.
headers = {
    'Accept': 'application/json, text/plain, */*',
    # NOTE(review): 'br' is advertised here; presumably the HTTP client can
    # decode Brotli responses - confirm, otherwise drop 'br'.
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    # Copied verbatim from the browser capture; kept unchanged.
    'etag': 'W/"7c7edf6fd4239861f696426091c6de38"',
    'Host': 'www.indiegogo.com',
    # Fixed: the HTTP header name is "Content-Type" (hyphen). The previous
    # key "Content_Type" was sent as an unrelated, non-standard header.
    'Content-Type': 'application/json;charset=utf-8',
    # NOTE(review): this makes every request conditional on a fixed validator;
    # a matching server may answer 304 with an empty body - confirm intended.
    'If-None-Match': 'W/"fa7571dc6b68ed2ad6b530ca10a18ef3"',
    'Referer': 'https://www.indiegogo.com/explore/home',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1',
    'Cookie': 'romref=shr-pica; romref_referer_host=www.indiegogo.com; cohort=www.indiegogo.com%7Cdir-XXXX%7Cshr-pica; visitor_id=59b931434d6ee85eee663e4760ac09a76445db065bc6acd98b88d42409e0cb6c; recent_project_ids=2407872%262417714%262417262%262384892; _session_id=da59389ae7968bb68521bf013b4fcaf3; _ga=GA1.2.200555809.1538962447; _gid=GA1.2.1681687736.1538962447; CookieConsent=-1; __hstc=223492548.3165943ada5fb7b8c70eb3a723d4d2ab.1538962486170.1538965202927.1538979952149.3; __hssrc=1; hubspotutk=3165943ada5fb7b8c70eb3a723d4d2ab; previous_account_ids=19555197; has_account=true; has_account_debug=1570516163; sailthru_hid=f580c6855d66897910ae324652e0303a5bbabe3191d15c41e70bbc23df9b0d826040b2da20ecc201770d0681; analytics_session_id=038e8b300f0270026e4d45b14b178da830ef7d59189c1d7509a9e490bae4ff01; __hssc=223492548.2.1538979952149; _gat=1',
}
# Query parameters for fetching the list of all products under
# "Tech & Innovation".
requestData = {
    'category_main': None,
    'category_top_level': None,
    # Fixed: the key used to contain stray trailing whitespace inside the
    # string, so it was not the 'page_num' parameter.
    'page_num': 1,
    'per_page': 12,
    # Fixed: this used to be the builtin function `all`, not the string
    # 'all' that the explore URL uses (project_timing=all).
    'project_timing': 'all',
    'project_type': 'campaign',
    'sort': 'trending',
}

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.firefox.options import Options

# Listing page for the "Tech & Innovation" category, sorted by trending.
_EXPLORE_URL = "https://www.indiegogo.com/explore/tech-innovation?project_type=campaign&project_timing=all&sort=trending"

# Mobile Safari user agent the scraper presents itself as.
_USER_AGENT = 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1'

# Cookie header captured from a browser session ("name=value" pairs separated
# by ';').
# NOTE(review): this embeds a captured session id; consider loading it from
# the environment instead of keeping it in source control.
_COOKIE_HEADER = 'romref=shr-pica;romref_referer_host=www.indiegogo.com; cohort=www.indiegogo.com%7Cdir-XXXX%7Cshr-pica; visitor_id=59b931434d6ee85eee663e4760ac09a76445db065bc6acd98b88d42409e0cb6c; recent_project_ids=2407872%262417714%262417262%262384892; _session_id=da59389ae7968bb68521bf013b4fcaf3; _ga=GA1.2.200555809.1538962447; _gid=GA1.2.1681687736.1538962447; CookieConsent=-1; __hstc=223492548.3165943ada5fb7b8c70eb3a723d4d2ab.1538962486170.1538965202927.1538979952149.3; __hssrc=1; hubspotutk=3165943ada5fb7b8c70eb3a723d4d2ab; previous_account_ids=19555197; has_account=true; has_account_debug=1570516163; sailthru_hid=f580c6855d66897910ae324652e0303a5bbabe3191d15c41e70bbc23df9b0d826040b2da20ecc201770d0681; analytics_session_id=038e8b300f0270026e4d45b14b178da830ef7d59189c1d7509a9e490bae4ff01; __hssc=223492548.2.1538979952149; _gat=1'

option = Options()
option.add_argument('-headless')
# Fixed: HTTP headers cannot be passed to Firefox as command-line arguments.
# The user agent is overridden through the corresponding Firefox preference.
option.set_preference('general.useragent.override', _USER_AGENT)

# `options=` replaces the deprecated `firefox_options=` keyword.
# `executable_path='geckodriver'` only restated the default lookup of
# geckodriver on PATH, so it is omitted.
browser = webdriver.Firefox(options=option)

# Cookies can only be set for the domain of the currently loaded document, so
# open the site first, inject the cookies, then reload with them in place.
browser.get(_EXPLORE_URL)
for _pair in _COOKIE_HEADER.split(';'):
    _name, _sep, _value = _pair.strip().partition('=')
    if not _sep:
        continue  # not a "name=value" pair
    try:
        browser.add_cookie({'name': _name, 'value': _value})
    except WebDriverException as e:
        # Best effort: a rejected cookie must not abort the whole run.
        print(e)
browser.refresh()

#element=browser.find_element_by_xpath("//div/div[@class='discoverableCard-title ng-binding discoverableCard-lineClamp2']")
#clike=browser.find_element_by_xpath("//div/a[@class='i-cta-1 ng-binding ng-isolate-scope']")

#def get_text(element):
##for each element
#    text=element.text.split('\n')
#    if(len(text)==8):
##        stitle=text[1].lower()
##        stitle=stitle.replace(':','')
##        stitle=stitle.replace('-','')
##        stitle='-'.join(stitle.split(' '))
#        title.append(text[1])
#        tagline.append(text[2])
#        category.append(text[3])
#        balance.append(text[4][1:])
#        currency_code.append(text[5].split(' ')[0])
#        collect_percent.append(text[6])
#        timeLeft.append(text[7])
#    elif(len(text)==9):
#        title.append(text[1])
#        tagline.append(text[2])
#        category.append(text[3])
#        balance.append(text[4][1:])
#        currency_code.append(text[5].split(' ')[0])
#        collect_percent.append(text[6])
#        timeLeft.append('Now funding through InDemend')
##        url="https://www.indiegogo.com/projects/"+stitle+"/x/19555197#/"
##        req=requests.get(url,headers=headers)
##        html=req.text
##        req.close()
##        rgx = re.compile("\"overview\"\:\"(.*?)\",\"overview_image_url\"",re.S)
##        string=rgx.findall(html)
##        stri=''.join(string).replace('\\n','')
##        stri=stri.replace('\\','')
##        overview.append(stri)
#
#try:
#    print(clike.text)
#    for i in range(100):
#        try:
#            clike.click()
#            time.sleep(4)
#            print(str(i)+'',end=' ')
#        except Exception as e:
#            print(e)
#            continue
#    print('开始爬取项目--------')
#    elements=browser.find_elements_by_xpath("//div[div/@class='discoverableCard-title ng-binding discoverableCard-lineClamp2']")
##    elements2=browser.find_element_by_xpath("//div[div/@class='discoverableCard-crowdfundingInDemand ng-scope']")
#
#    n=0
#    title=[]
#    tagline=[]
#    category=[]
#    balance=[]
#    currency_code=[]
#    collect_percent=[]
#    timeLeft=[]
#    for element in elements:
#        print(str(n)+' ',end=' ')
#        n=n+1
#        get_text(element)
#    data=DataFrame({'title':title,'tagline':tagline,'balance':balance,'currency_code':currency_code,'collect_percent':collect_percent,'category':category,'timeLeft':timeLeft})
#    columns=['title','tagline','category','balance','currency_code','collect_percent','timeLeft']
#    data.to_excel('indiegogo2.xlsx',columns=columns)
#    
##    title=[]
##    tagline=[]
##    category=[]
##    balance=[]
##    currency_code=[]
##    collect_percent=[]
##    timeLeft=[]
##    for element in elements2:
##        print(str(n)+' ',end=' ')
##        n=n+1
##        get_text2(element)
##    data=DataFrame({'title':title,'tagline':tagline,'balance':balance,'currency_code':currency_code,'collect_percent':collect_percent,'category':category,'timeLeft':timeLeft})
##    columns=['title','tagline','category','balance','currency_code','collect_percent','timeLeft']
##    data.to_excel('indiegogo2.xlsx',columns=columns)
#    browser.quit()
#except Exception as e:
#    print(e)
#    browser.quit()