jinxuan
2/11/2016 - 8:49 PM

get1000Records.py

import string

import requests
from bs4 import BeautifulSoup
from pprint import pprint
import codecs
import time
import google
f = codecs.open('bullhorn.txt', encoding='utf-8', mode='r')

queryList = [line.strip() for line in f if line.strip() != '']

countOfFile = 0
for query in queryList:
    #Change
    query = filter(lambda x: x in string.printable, query)
    googleResultGenerator = google.search('site:bullhornreach.com ' + query)
    print "current querying bullhorn" + query
    countOfGoogleResultInOnePage = 0
    fileName = "jOutput" +str(countOfFile) + '.txt'
    resultOfOneQuery = ""
    JobTitle = query
    for googleResultOfOneQuery in googleResultGenerator:
        print "current result is " + str(countOfGoogleResultInOnePage)
        if countOfGoogleResultInOnePage > 5:
            break
        time.sleep(1)
        if "bullhornreach" not in googleResultOfOneQuery:
            countOfGoogleResultInOnePage+=1
            continue
        else:
            bullHornPage = requests.get(googleResultOfOneQuery).content
            soup = BeautifulSoup(bullHornPage, 'html.parser')
            title = soup.find('h1', class_='jobtitle')
            if(title == None):
                continue
            else:
                #[s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
                #visible_text = soup.getText()
                resultOfOneQuery += filter(lambda x: x in string.printable, visible_text)
        countOfGoogleResultInOnePage+=1

    if resultOfOneQuery.strip() != "":
        file = codecs.open('./data/'+fileName, 'w', 'utf-8')
        file.write(JobTitle + '\n')
        file.write(resultOfOneQuery)
        file.close()
        countOfFile+=1
    else:
        print "no match is found"