# bhubbard
# 10/31/2016 - 9:35 PM
#
# Quick helper script for mining Zillow data / commute times

"""
quick script put together to scrape zillow listing data
at the very least it functions so you don't need to go back and forth to calculate commute times
very messy... maybe i'll revisit this someday

"""
import requests
import re
import csv

from pyzillow import pyzillow
from bs4 import BeautifulSoup

from gmaps import Geocoding,Directions
import json

# AUTH FOR ZILLOW
# NOTE(review): "zillow_id" looks like a placeholder ZWSID — replace with a real
# Zillow Web Services ID before running.
zwsid = "zillow_id"
zillow_data = pyzillow.ZillowWrapper(zwsid)

# Initialize a directions class for Google Maps
api2 = Directions()

"""
or construct URL based on different parameters...


beds = 
bath =
town =
state =
building =
min_rent = 
max_rent = 
...

"""
# Commute destinations, as "lat, lng" strings for the Google Maps directions API.
dest_1 = "42.463441, -71.266918"  # location 1
dest_2 = "42.74625, -71.181849"   # location 2

# Sample search URL (Cambridge, MA rentals: 2+ beds, 1+ baths, rent <= $2800).
# BUG FIX: the original line ended with a stray ')' — a syntax error.
url = "http://www.zillow.com/homes/for_rent/Cambridge-MA/house,condo,apartment_duplex,townhouse_type/3934_rid/2-_beds/1-_baths/0-757142_price/0-2800_mp/days_sort/42.396016,-71.035795,42.360764,-71.188574_rect/12_zm/"


def extractID(to_parse):
	"""Pull the street address and zip code out of a /homedetails/ link.

	Returns an (address, zipcode) tuple of strings. Raises IndexError if
	the /homedetails/<slug>/ pattern is not present in `to_parse`.
	"""
	# Run the regex once instead of twice (the original recomputed the
	# identical findall for address and zipcode).
	slug = re.findall(r"/homedetails/(.*)/[1-9]", to_parse)[0]
	# Slug looks like "123-Main-St-Town-MA-02139": dashes separate words
	# and the last 5 characters are the zip code.
	address = slug.replace("-", " ")[:-5]
	zipcode = slug.replace("-", "")[-5:]
	return (str(address), str(zipcode))

def downURL(url):
	"""Download the search-results page and return one (address, zipcode)
	tuple per listing link found.

	BUG FIX: the original returned from inside the for loop on the first
	iteration, and the append was commented out, so it always produced an
	empty list (or None when there were no results).
	"""
	id_list = []
	r = requests.get(url)
	# Explicit parser keeps bs4 from guessing (and from warning).
	soup = BeautifulSoup(r.text, "html.parser")
	results = soup.findAll("a", {"class" : "hdp-link routable"})
	for tag in results:
		id_list.append(extractID(str(tag)))
	return id_list

def getTravelDuration(route):
	"""Total travel time, in minutes, summed over every step of the first
	leg of each route alternative in `route`.

	`route` is the list of route dicts returned by gmaps Directions.

	BUG FIXES: the original loop body was not indented (IndentationError),
	`duration` was read before it was ever assigned, and the result was
	multiplied by 60 even though step durations are already in seconds —
	seconds convert to minutes by dividing by 60.
	"""
	total_seconds = 0
	for alternative in route:
		# JSON round-trip guarantees plain dicts/lists to index into.
		parsed_route = json.loads(json.dumps(alternative))
		# Walk the first leg and accumulate per-step durations.
		for step in parsed_route['legs'][0]['steps']:
			total_seconds += step['duration']['value']  # seconds
	return total_seconds / 60.0  # minutes

def getZillowData(add):
	"""Build a dict of listing data for one (address, zipcode) tuple.

	Combines Zillow deep-search results, a scraped asking price, and two
	commute times (driving to dest_1, transit to dest_2).
	"""
	property_data = {}
	property_details = zillow_data.get_deep_search_results(add[0], add[1])
	result = pyzillow.GetDeepSearchResults(property_details)

	property_data['zillow_id'] = result.zillow_id
	property_data['lat'] = result.latitude
	property_data['lon'] = result.longitude
	# BUG FIX: size/bath/bed were all copy-pasted as result.longitude.
	property_data['size'] = result.home_size
	property_data['bath'] = result.bathrooms
	property_data['bed'] = result.bedrooms
	property_data['details'] = result.home_detail_link

	# Extract the asking price from the listing page's summary row.
	price_getter = requests.get(result.home_detail_link)
	soup = BeautifulSoup(price_getter.text)
	parse_me = soup.findAll("div", {"class" : "main-row home-summary-row"})
	# BUG FIX: the original ran the regex against an undefined name `price`;
	# scan the summary-row text for a $-amount instead.
	summary_text = parse_me[0].get_text() if parse_me else ""
	prices = re.findall(r"\$[\d,]+", summary_text)
	property_data['price'] = prices[0] if prices else None

	# Set route parameters. The directions API expects an address string,
	# not the raw (address, zipcode) tuple — TODO confirm against gmaps docs.
	origin = "{0} {1}".format(add[0], add[1])
	route1 = api2.directions(origin, dest_1, mode="driving")
	route2 = api2.directions(origin, dest_2, mode="transit")

	# Get commute times
	property_data['dist_1'] = getTravelDuration(route1)
	property_data['dist_2'] = getTravelDuration(route2)
	return property_data

def toCSV(url):
	"""Scrape every listing found at `url` and write the rows to filename.csv."""
	prop_list = downURL(url)

	# Get data for each property
	properties = [getZillowData(p) for p in prop_list]

	# Guard: with no listings, properties[0] raised IndexError in the original.
	if not properties:
		return

	keys = properties[0].keys()
	# Write to CSV file. BUG FIX: the original opened in 'wb', which the
	# Python 3 csv module rejects; text mode with newline='' is the
	# documented way to avoid blank lines on Windows.
	with open('filename.csv', 'w', newline='') as outfile:
		dict_writer = csv.DictWriter(outfile, keys)
		dict_writer.writeheader()
		dict_writer.writerows(properties)