carlessanagustin
5/13/2016 - 7:52 AM

Finding a house with Python - from: https://tudorbarbu.ninja/hacking-the-bcn-house-market/

-- --------------------------------------------------------

--
-- Table structure for table `house`
--

-- Scraped listings. Keys (PRIMARY KEY on `id`, UNIQUE on `external_id`)
-- are added by the ALTER TABLE statements further down, phpMyAdmin-dump style.
CREATE TABLE IF NOT EXISTS `house` (
  `id` int(10) unsigned NOT NULL,
  -- Listing id assigned by the source site; made UNIQUE below so each
  -- listing is stored at most once
  `external_id` int(10) unsigned NOT NULL,
  -- Review flag, defaults to 0 (not rejected)
  `rejected` tinyint(1) DEFAULT '0',
  `url` varchar(256) COLLATE utf16_bin NOT NULL,
  `rooms` int(11) NOT NULL,
  `bathrooms` int(11) NOT NULL,
  `floor` int(11) NOT NULL,
  `street` text COLLATE utf16_bin NOT NULL,
  `price` int(11) NOT NULL
) ENGINE=InnoDB AUTO_INCREMENT=22 DEFAULT CHARSET=utf16 COLLATE=utf16_bin;

-- --------------------------------------------------------

--
-- Table structure for table `picture`
--

-- Photo fingerprints, many per house (joined via `house_id` in the spider).
CREATE TABLE IF NOT EXISTS `picture` (
  `id` int(10) unsigned NOT NULL,
  -- Owning row in `house`; plain index only, no FK constraint is declared
  `house_id` int(10) unsigned NOT NULL,
  -- dHash of the photo as a hex string (computed by dhash() below)
  `hash` varchar(255) COLLATE utf16_bin NOT NULL
) ENGINE=InnoDB AUTO_INCREMENT=204 DEFAULT CHARSET=utf16 COLLATE=utf16_bin;

--
-- Indexes for dumped tables
--

--
-- Indexes for table `house`
--
-- Primary key plus the lookup keys the spider relies on; `external_id`
-- is UNIQUE so a listing can only be inserted once.
ALTER TABLE `house`
  ADD PRIMARY KEY (`id`), ADD UNIQUE KEY `external_id` (`external_id`), ADD KEY `rejected` (`rejected`);

--
-- Indexes for table `picture`
--
-- Primary key plus an index supporting the house -> pictures join.
ALTER TABLE `picture`
  ADD PRIMARY KEY (`id`), ADD KEY `house_id` (`house_id`);

--
-- AUTO_INCREMENT for dumped tables
--

--
-- AUTO_INCREMENT for table `house`
--
-- Dump-style: AUTO_INCREMENT is attached to `id` after table creation,
-- resuming at the counter value captured when the dump was taken.
ALTER TABLE `house`
  MODIFY `id` int(10) unsigned NOT NULL AUTO_INCREMENT,AUTO_INCREMENT=22;
--
-- AUTO_INCREMENT for table `picture`
--
-- Same pattern for `picture`: attach AUTO_INCREMENT and restore counter.
ALTER TABLE `picture`
  MODIFY `id` int(10) unsigned NOT NULL AUTO_INCREMENT,AUTO_INCREMENT=204;
# NOTE: this script targets Python 2 (cStringIO, urllib.urlopen and
# xrange are used below; none of them exist in Python 3).
import scrapy
import peewee
import re
import urllib
import cStringIO
from PIL import Image
from playhouse.db_url import connect

# MySQL connection for the `house` schema created by the SQL dump above
# (user root, empty password, local server).
db = connect('mysql://root:@127.0.0.1/house')

def dhash(image, hash_size = 8):
    """Compute the difference hash (dHash) of a PIL image.

    The image is converted to greyscale and shrunk to
    (hash_size + 1) x hash_size pixels; each pixel is then compared to
    its right-hand neighbour, producing hash_size * hash_size bits that
    are packed LSB-first into a hex string (2 hex chars per 8 bits).

    :param image: a PIL.Image.Image instance
    :param hash_size: edge length of the hash grid (default 8 -> 64 bits)
    :return: hex string of hash_size * hash_size / 4 characters
    """
    # Greyscale + downscale; the extra column gives every one of the
    # hash_size columns a right-hand neighbour to compare against.
    image = image.convert('L').resize(
        (hash_size + 1, hash_size),
        Image.ANTIALIAS,
    )

    # (The original also materialised list(image.getdata()) here but
    # never used it -- dead work on the whole pixel buffer, removed.)

    # One bit per cell: is the pixel brighter than its right neighbour?
    # range instead of xrange: identical here and also valid on Python 3.
    difference = []
    for row in range(hash_size):
        for col in range(hash_size):
            pixel_left = image.getpixel((col, row))
            pixel_right = image.getpixel((col + 1, row))
            difference.append(pixel_left > pixel_right)

    # Pack the bits into hex, 8 bits (2 hex digits) at a time;
    # bit index i within a byte contributes 2**i (LSB-first).
    decimal_value = 0
    hex_string = []
    for index, value in enumerate(difference):
        if value:
            decimal_value += 2**(index % 8)
        if (index % 8) == 7:
            hex_string.append(hex(decimal_value)[2:].rjust(2, '0'))
            decimal_value = 0

    return ''.join(hex_string)

def hamming_distance(s1, s2):
    """Return the number of positions at which *s1* and *s2* differ.

    :raises ValueError: when the sequences have different lengths,
        since the Hamming distance is undefined in that case.
    """
    if len(s1) != len(s2):
        raise ValueError('Undefined for sequences of unequal length')
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    return mismatches

class House(peewee.Model):
    # Peewee model mapped onto the `house` table created by the SQL
    # dump above; field declaration order matches the table columns.
    bathrooms = peewee.IntegerField()
    floor     = peewee.IntegerField()
    price     = peewee.IntegerField()
    rooms     = peewee.IntegerField()
    # Listing id assigned by the source site, stored in `external_id`;
    # the UNIQUE constraint prevents inserting the same listing twice.
    external  = peewee.IntegerField(db_column='external_id', unique=True)
    rejected  = peewee.IntegerField(index=True, null=True)
    street    = peewee.TextField()
    url       = peewee.CharField()

    class Meta:
        # Connection configured at module level via playhouse connect().
        database = db

class Picture(peewee.Model):
    # Photo fingerprint for a listing: `hash` holds the hex dHash
    # produced by dhash(); a house can own several pictures, reachable
    # as house.pictures through the related_name below.
    hash  = peewee.CharField()
    house = peewee.ForeignKeyField(House, related_name='pictures')

    class Meta:
        # Same module-level MySQL connection as House.
        database = db

class HouseSpider(scrapy.Spider):
    """Crawl fotocasa.es search results and persist de-duplicated
    listings through the House/Picture peewee models above.

    Two listings are treated as the same flat when they match on rooms,
    bathrooms and floor AND share at least one photo whose dHashes are
    within MAX_HASH_DISTANCE bits of each other; only the cheaper copy
    is kept in the database.
    """

    name       = 'HouseSpider'
    start_urls = [
        'http://fotocasa.es/search/results.aspx?opi=36&bti=2&bsm=&tti=3&prchti=2&cu=es-ES&mode=1&maxp=600&minr=2&fs=true&ts=Barcelona%2520Capital&l=724,9,8,232,376,8019,0,0,0'
    ]

    # Max Hamming distance between two 64-bit dHashes for the photos to
    # count as the same picture (was an inline magic number 10).
    MAX_HASH_DISTANCE = 10

    def parse(self, response):
        """Walk the result list and queue a detail-page request for
        every listing not yet stored."""
        for li in response.css('div#photo-content ul.listPhotos li'):
            href = li.css(
                'div.property-information a.property-location'
            ).xpath('@href').extract()[0]
            url, external_id = self.extract_url_info(href)

            # Only fetch the detail page for listings we have not seen
            # (UNIQUE external_id would reject a re-insert anyway).
            try:
                House.get(House.external == external_id)
            except peewee.DoesNotExist:
                yield scrapy.Request(
                    url=url,
                    callback=self.parse_secondary_request,
                    meta={
                        'url' : url,
                        'external_id' : external_id
                    }
                )

    def parse_secondary_request(self, response):
        """Parse a listing detail page and insert it unless an equally
        cheap (or cheaper) duplicate already exists."""
        rooms = response.css('#litRooms b::text').extract()[0]

        # Some listings omit the bathroom count; default to 0 rather
        # than dropping the listing. (Narrowed from a bare except:.)
        try:
            bathrooms = response.css('#litBaths b::text').extract()[0]
        except IndexError:
            bathrooms = 0

        street = response.css('h1.property-title::text').extract()[0].strip()
        comma = street.find(',')
        # Drop the 9-character listing-type prefix and everything after
        # the first comma. NOTE(review): the fixed offset 9 presumably
        # skips a prefix such as "Piso en " -- confirm against markup.
        street = street[9:comma if comma != -1 else len(street)]

        price = int(re.search(r'\d+', response.css('#priceContainer::text').extract()[0]).group())

        # Floor element may be missing (IndexError) or contain no digits
        # (re.search -> None -> AttributeError); treat both as 0.
        try:
            floor = int(re.search(r'\d+', response.css('#litFloor::text').extract()[0]).group())
        except (IndexError, AttributeError):
            floor = 0

        hashes = self.get_image_hashes(response)
        candidates = House.select(House, Picture).join(Picture).where(
            House.rooms == rooms,
            House.bathrooms == bathrooms,
            House.floor == floor
        )
        # (The extra houses.count() round-trip was redundant: scanning
        # an empty result set yields None just the same.)
        duplicate = self._find_duplicate(candidates, hashes)

        if duplicate is not None:
            if duplicate.price <= price:
                # Existing copy is as cheap or cheaper; keep it.
                return
            # New listing is cheaper: replace the old one (recursive
            # delete also removes its Picture rows).
            duplicate.delete_instance(True)

        house = House(
            bathrooms = bathrooms,
            floor     = floor,
            price     = price,
            rooms     = rooms,
            external  = response.meta['external_id'],
            street    = street,
            url       = response.meta['url']
        )
        house.save()
        for hash in hashes:
            Picture(house=house.id, hash=hash).save()

    def _find_duplicate(self, houses, hashes):
        """Return the first stored house sharing a near-identical photo
        with *hashes*, or None when no candidate matches."""
        for house in houses:
            for picture in house.pictures:
                for hash in hashes:
                    if hamming_distance(picture.hash, hash) <= self.MAX_HASH_DISTANCE:
                        return house
        return None

    def get_image_hashes(self, response):
        """Download every carousel image of the page and return the
        list of their dHashes."""
        hashes = []
        for img in response.css('.carousel_slide img'):
            # Lazy-loaded images keep the real URL in data-src; fall
            # back to src otherwise.
            path = img.xpath('@data-src').extract()
            if not path:
                path = img.xpath('@src').extract()

            buf = cStringIO.StringIO(urllib.urlopen(path[0]).read())
            hashes.append(dhash(Image.open(buf)))

        return hashes

    def extract_url_info(self, long_url):
        """Split a result-list href into (clean_url, external_id).

        The external id is the trailing dash-separated token of the URL
        path once the query string is removed.
        """
        # BUGFIX: the original sliced with find('?') unconditionally, so
        # a URL without a query string lost its last character (-1 used
        # as a slice end).
        qpos = long_url.find('?')
        url = long_url[:qpos] if qpos != -1 else long_url
        external_id = url[url.rfind('-') + 1:]

        return url, external_id