# Gist header preserved as a comment so the file remains valid Python:
# author: shafayeatsumit — 9/27/2017 - 6:20 PM — gistfile1.txt
# -*- coding: utf-8 -*-
 
import json
import re
 
# Import Scrapy stuff
import scrapy
import time
from scrapy import signals
from scrapy.http import HtmlResponse
from scrapy_splash import SplashRequest
from scrapy.linkextractors import LinkExtractor
import random

from pyproj import Proj, transform

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

try:
    from shop_info.items import ShopInfoItem
except:
    from items import ShopInfoItem
 
try:
    from shop_info import useragent
except:
    import useragent
 

# Default scraping configuration, used when the spider is started without an
# explicit ``input_data`` argument.
#
# - format:       one rule per output item. ``regex_item`` is a "$N" group
#                 reference into ``regex_match``; the string "Null" disables
#                 a field.
# - target:       the store pages to scrape (id + URL).
# - coordination: how to locate the Google-Maps link and pull lat/lon out of
#                 its URL ("$1"/"$2" are regex group references).
#
# Fix: the regexes are now raw strings — the originals relied on invalid
# escape sequences such as '\d' and '\/' inside plain string literals, which
# is a DeprecationWarning/SyntaxWarning on modern Python. The resulting
# string values are byte-identical.
input_data = {
    'format': [
        {
            'item_name': 'name',
            'xpath': '//*[@id="topicPath"]/li[2]/text()',
            'regex_match': 'Null',
            'regex_item': 'Null',
            'remove_tag': ''},
        {
            'item_name': 'zip',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒([\d\-]+)\s*',
            'regex_item': '$1 ',
            'remove_tag': ''
            },
        {
            'item_name': 'address',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒[\d\-]+\s*(.+)\s*TEL',
            'regex_item': '$1 ',
            'remove_tag': ''},
        {
            'item_name': 'tel',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒[\d\-]+\s*.+\s*TEL\s*\:*\s*([\d\-\(\)\s]+)',
            'regex_item': '$1',
            'remove_tag': ''}
    ],

    'target': [
            {
                'id': '1',
                'url': 'https://www.takashimaya.co.jp/rakusai/index.html'
                },
            {
                'id': '2',
                'url': 'https://www.takashimaya.co.jp/okayama/index.html'
                },
            {
                'id': '3',
                'url': 'https://www.takashimaya.co.jp/tachikawa/index.html'},
            {
                'id': '4',
                'url': 'https://www.takashimaya.co.jp/yokohama/index.html'}
        ],

    'coordination': [
            {
                 "item_name": "coordination",
                 "mapurl_xpath": '//*[@id="storeInfo"]/div/p/a',
                 "xpath": '//*[@id="mapDiv"]/div/div/div[10]/div/div/div/div[7]/div/a',
                 "regex_match": r"https:\/\/maps\.google\.com\/maps\?.*ll=([\d\.]+),([\d\.]+)",
                 "lat": "$1",
                 "lon": "$2",
                 "mapurl_regex": r"\/[^\/]+\/access\/index.html"
            }
        ]
}



# Lua scripts executed through the Splash 'execute' endpoint.

# Initial page load: wait 10 s for JS to settle, return the rendered HTML.
lua_first_page = """
function main(splash)
  local url = splash.args.url
  assert(splash:go(url))
  assert(splash:wait(10))
  return {
    html = splash:html()
  }
end
"""

# Map-page load: shorter 6 s wait, same rendered-HTML return.
map_url_page = """
function main(splash)
  local url = splash.args.url
  assert(splash:go(url))
  assert(splash:wait(6))
  return {
    html = splash:html()
  }
end
"""

# Follow a shortened map URL: wait for the redirect to resolve, then return
# both the final URL and the rendered HTML.
# Fix: Lua table constructors require a separator between fields; the comma
# after `url = splash:url()` was missing, which is a Lua syntax error and
# would make Splash reject the script.
tiny_url_extract = """
    function main(splash)
      local url = splash.args.url
      assert(splash:go(url))
      assert(splash:wait(15))
      return {
        url = splash:url(),
        html = splash:html()
      }
    end
"""

useragent_lists = useragent.user_agent_list

# Browser-like default request headers. The User-Agent is seeded with a
# random entry here at import time and rotated again per request inside
# ShopSpider.start_requests().
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-GB,en-US;q=0.8,en;q=0.6',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': random.choice(useragent_lists),
}

def remove_tag(text, remove_tag):
    """Strip HTML tags from *text* according to the mode in *remove_tag*.

    Mode 1 deletes each tag, mode 2 replaces it with a single space, and
    mode 3 with the literal marker ``__tag__``; any other mode returns the
    text unchanged.
    """
    replacements = {1: '', 2: ' ', 3: '__tag__'}
    mode = int(remove_tag)
    if mode in replacements:
        return re.sub('<[^<]+?>', replacements[mode], text)
    return text


def cleanup_data(data):
    """Replace every Yen sign with a backslash in all dict string values.

    The dicts are modified in place; a new list of those same dicts is
    returned (or *data* itself when it is empty), matching the original
    behavior exactly.
    """
    cleaned = []
    for entry in data:
        for key, value in entry.items():
            if "¥" in value:
                entry[key] = value.replace("¥", "\\")
        cleaned.append(entry)
    return cleaned or data


class ShopSpider(scrapy.Spider):
    """Scrape shop details (name/zip/address/tel and map coordinates) from
    the configured target URLs, driven by the xpath/regex rules in
    ``input_data``.

    Pages are rendered through Splash with the Lua scripts defined above;
    map links that need an actual click are driven through a remote Selenium
    Chrome instance.
    """

    name = "shop_info"

    # Let 404/500 responses reach parse() instead of being dropped by Scrapy,
    # so after_404() can emit an explicit "Page not Found" item.
    handle_httpstatus_list = [404, 500]
 
    def __init__(self, *args, **kwargs):
        """Connect to a Selenium hub and load the scraping configuration.

        ``input_data`` may be supplied as a JSON string via spider kwargs;
        otherwise the module-level default above is used.
        """
        super(ShopSpider, self).__init__(*args, **kwargs)

        # Prefer a local Selenium hub; fall back to a hard-coded remote one.
        # NOTE(review): the bare except hides unrelated errors; catching the
        # specific WebDriverException would be safer.
        try:
            self.driver = webdriver.Remote(command_executor='http://127.0.0.1:4444/wd/hub', desired_capabilities=DesiredCapabilities.CHROME)
        except:
            self.driver = webdriver.Remote(command_executor='http://138.68.241.86:4444/wd/hub', desired_capabilities=DesiredCapabilities.CHROME)

 
        self.input_data = kwargs.get('input_data', None)
        if self.input_data:
            self.input_data = json.loads(self.input_data)
        if not self.input_data:
          self.input_data = input_data
        # Normalise Yen-sign escapes in every rule section before use.
        self.target_urls = cleanup_data(self.input_data["target"])
        self.formats = cleanup_data(self.input_data["format"])
        self.coordinates = cleanup_data(self.input_data["coordination"])
 
    def start_requests(self):
        """Yield one Splash 'execute' request per target URL."""
        for target in self.target_urls:

            # Rotate the User-Agent on the shared module-level headers dict.
            headers["User-Agent"] = useragent_lists[random.randrange(0, len(useragent_lists))]
 
            yield SplashRequest(target["url"], self.parse, errback=self.handle_error, headers=headers, meta={
                'id': target["id"], "url": target["url"], "http_status_from_error_code":True},
                endpoint='execute', args={'lua_source': lua_first_page, 'timeout': 90}
                )
 
    def parse(self, response):
        """Extract every configured item from the rendered page, then resolve
        the shop coordinates (inline regex, tiny-URL redirect, or a Selenium
        click on the map link), yielding ShopInfoItem results.
        """
        id = response.meta.get('id')
        url = response.meta.get('url')

        if response.status in self.handle_httpstatus_list:
            # NOTE(review): parse() is a generator (it yields below), so this
            # returned Request is discarded by Scrapy rather than scheduled;
            # it likely needs to be yielded (followed by a bare return).
            return scrapy.Request(url=url, callback=self.after_404, meta={
                'id': id, "url": url})
 
        # NOTE(review): the item is kept on self, which is shared state; with
        # several targets in flight concurrently, responses can overwrite
        # each other's output.
        self.output = ShopInfoItem()
        self.output["id"] = id
        self.output["URL"] = url
        xpath_log = []
        regex_log = []
        strings_from_xpath = []
        # One pass per configured item: xpath extraction first, then optional
        # tag stripping and regex group extraction; failures are logged into
        # the item rather than aborting the page.
        for fmt in self.formats:
            x_log = {}
            r_log = {}
            item = fmt["item_name"]
            item_val = None
            if "regex_match" in fmt and fmt["regex_match"] != "Null" and fmt["xpath"] !="Null":
                try:
                    # item_text = response.xpath('{0}'.format(fmt["xpath"])).extract_first()
                    item_text = response.xpath(fmt["xpath"]).extract_first()

                    if not item_text:
                        x_log["item"] = item
                        x_log["xpath"] = fmt["xpath"]
                        x_log["message"] = "Failed to get string from the Xpath"
                        xpath_log.append(x_log)
                    else:
                        try:
                            if "remove_tag" in fmt and fmt["remove_tag"] != "Null" and fmt["remove_tag"]:
                                item_text = remove_tag(item_text, fmt["remove_tag"])

                            line = re.search(r'{}'.format(fmt["regex_match"]), item_text.strip())
                            if line:
                                # regex_item holds a "$N" group reference;
                                # pull out the N and take that match group.
                                grp = re.findall(r'\d+', fmt["regex_item"])[0]
                                li = line.group(int(grp))
                                item_val = li
                            else:
                                r_log["item"] = item
                                r_log["item Text"] = item_text
                                r_log["regex"] = fmt["regex_match"]
                                r_log["message"] = "Failed to get item from Regex"
                                regex_log.append(r_log)
                                string_log = {"item": item, "string":item_text}
                                strings_from_xpath.append(string_log)
                        except Exception as e:
                            # NOTE(review): the raw exception object is stored
                            # in the item; it may not serialize cleanly.
                            r_log["item"] = item
                            r_log["item Text"] = item_text
                            r_log["regex"] = fmt["regex_match"]
                            r_log["message"] = e
                            regex_log.append(r_log)
                            string_log = {"item": item, "string":item_text}
                            strings_from_xpath.append(string_log)

                except Exception as e:
                    r_log["item"] = item
                    r_log["xpath"] = fmt["xpath"]
                    r_log["message"] = e
                    xpath_log.append(r_log)
                    pass

            elif fmt["xpath"] !="Null":
                # No regex configured: the raw xpath result is the value.
                item_val = response.xpath(fmt["xpath"]).extract_first()
                if "remove_tag" in fmt and fmt["remove_tag"] != "Null" and fmt["remove_tag"]:
                    item_val = remove_tag(item_val, fmt["remove_tag"])
                if not item_val:
                    x_log["item"] = item
                    x_log["xpath"] = fmt["xpath"]
                    x_log["message"] = "Failed to get string from the Xpath"
                    xpath_log.append(x_log)
            elif "remove_tag" in fmt and fmt["remove_tag"] != "Null" and fmt["remove_tag"]:
                item_text = response.xpath(fmt["xpath"]).extract_first()
                item_text = remove_tag(item_text, fmt["remove_tag"])
                item_val = item_text
            self.output[item] = item_val
            self.output["xpath_log"] = xpath_log if len(xpath_log) else None
            self.output["regex_log"] = regex_log if len(regex_log) else None
            if len(strings_from_xpath):
                self.output["strings_from_xpath"] = strings_from_xpath
        
        
        # Coordinate extraction: three strategies depending on configuration.
        for cords in self.coordinates:
            # NOTE(review): cords["mapurl_regex"] is read before the
            # `"mapurl_regex" not in cords` test, so a missing key raises
            # KeyError instead of taking this branch.
            if cords and (cords["mapurl_regex"] == "Null" or cords["mapurl_regex"] == "" or "mapurl_regex" not in cords):
                if "tinymapurl_regex" in cords and cords["tinymapurl_regex"]:
                    # Strategy 1: find a shortened map URL in the raw HTML and
                    # follow it through Splash to read the resolved URL.
                    match = re.search(r'{}'.format(cords["tinymapurl_regex"]), response.body.decode('utf-8'))
                    if match:
                        tiny_url = match.group()
                        yield SplashRequest(tiny_url, self.parse_tiny_url, errback=self.handle_error, headers=headers, meta={"http_status_from_error_code":True, "output":self.output, "cords":cords},
                            endpoint='render.json', args={'lua_source': tiny_url_extract, 'timeout': 90, 'iframes': 1, 'html': 1})
                    else:
                        # NOTE(review): "tinyurl_regex" is not the key checked
                        # above ("tinymapurl_regex") — this line raises
                        # KeyError when the regex fails to match.
                        regex_log = {"regex_log":{"message":"Failed to get item from this regex", "regex":cords["tinyurl_regex"], 'strings_from_xpath':'* whole html docment *'}, "xpath_log":""}
                        self.output["coordination"] = regex_log
                        yield self.output

                else:
                    # Strategy 2: lat/lon regex applied to the xpath text (or
                    # the whole document), optionally reprojected via pyproj.
                    item_name = "coordination"
                    try:
                        item_text = response.xpath(cords["xpath"]).extract_first()

                        match = None
                        if item_text:
                            match = re.search(r'{}'.format(cords["regex_match"]), item_text.strip())
                        else:
                            match = re.search(r'{}'.format(cords["regex_match"]), response.body.decode("utf-8"))
                        
                        if match:
                            org_lat = match.group(1)
                            org_lon = match.group(2)
                            if org_lat and org_lon and "srid" in cords and cords["srid"]:
                                # Convert from the configured SRID to WGS84.
                                inProj  = Proj("+init=EPSG:{}".format(cords["srid"]))
                                outProj = Proj("+init=EPSG:4326")
                                lon, lat = transform(inProj,outProj,org_lon,org_lat)
                                self.output[item_name] = {"lat": lat, "lon":lon, "org_lat":org_lat, "org_lon": org_lon}
                            else:
                                self.output[item_name] = {"lat": org_lat, "lon":org_lon}
                            yield self.output
                        else:
                            if item_text:
                                regex_log = {"regex_log":{"message":"Failed to get item from this regex", "regex":cords["regex_match"], 'strings_from_xpath':item_text}, "xpath_log":""}
                            else:
                                regex_log = {"regex_log":{"message":"Failed to get item from this regex", "regex":cords["regex_match"], 'strings_from_xpath':'* whole html docment *'}, "xpath_log":""}
                            self.output["coordination"] = regex_log
                            yield self.output

                    except Exception as e:
                        xpath_log = {"regex_log":"", "xpath_log":{"message":e, "xptah":cords["xpath"]}}
                        self.output["coordination"] = xpath_log
                        yield self.output

            elif  "mapurl_xpath" in cords and cords["mapurl_xpath"]:
                # Strategy 3: click the map link with Selenium and feed the
                # resulting URL back through Splash (parse_map_url).
                try:
                    
                    self.driver.get(url)
                    path = self.driver.find_element_by_xpath(cords["mapurl_xpath"]).get_attribute('outerHTML')
                    match = None

                    if path:
                        match = re.search(r'{}'.format(cords["mapurl_regex"]), path)
                    
                    if match:
                        self.driver.find_element_by_xpath(cords["mapurl_xpath"]).click()
                        time.sleep(5)
                        yield SplashRequest(self.driver.current_url, self.parse_map_url, errback=self.handle_error, headers=headers, meta={"http_status_from_error_code":True, "output":self.output, "cords":cords},
                        endpoint='render.json', args={'lua_source': map_url_page, 'timeout': 90, 'iframes': 1, 'html': 1})
                    else:
                        # The mapurl regex did not match the link's HTML, but
                        # the click is still attempted as a best effort.
                        try:
                            self.driver.find_element_by_xpath(cords["mapurl_xpath"]).click()
                            time.sleep(5)
                            yield SplashRequest(self.driver.current_url, self.parse_map_url, errback=self.handle_error, headers=headers, meta={"http_status_from_error_code":True, "output":self.output, "cords":cords},
                            endpoint='render.json', args={'lua_source': map_url_page, 'timeout': 90, 'iframes': 1, 'html': 1})
                        except Exception as e:
                            if path:
                                regex_log = {"mapurl_regex_log":{"message":"Failed to get item from this Mapurl regex", "regex":cords["mapurl_regex"], 'strings_from_xpath':path}, "xpath_log":"",}
                                self.output["coordination"] = regex_log
                            else:
                                xpath_log = {"regex_log":"", "xpath_Log":{"message":e, "xptah":cords["mapurl_xpath"]}}
                                self.output["coordination"] = xpath_log
                            yield self.output

                except Exception as e:
                    xpath_log = {"regex_log":"", "xpath_Log":{"message":e, "xptah":cords["mapurl_xpath"]}}
                    self.output["coordination"] = xpath_log
                    yield self.output
            else:
                yield self.output

            # NOTE(review): dead code — this sits inside the for-loop, and an
            # empty self.coordinates never enters the loop at all.
            if not self.coordinates:
                yield self.output

    def after_404(self, response):
        """Emit a minimal item marking the target page as not found."""
        id = response.meta.get('id')
        url = response.meta.get('url')
        output = ShopInfoItem()
        output["id"] = id
        output["URL"] = url
        output["404"] = "Page not Found"

        yield output

    def handle_error(self, failure):
        """Errback: emit an item recording the failed URL and failure type."""
        # id = response.meta.get('id')
        # url = response.meta.get('url')

        url = failure.request.url
        Error = failure.type

        output = ShopInfoItem()
        output["URL"] = url
        output["Error"] = Error
        yield output

    def parse_map_url(self, response):
        """Parse the Splash render.json result of a clicked map link: pull
        lat/lon out of the configured xpath (or the first child frame's HTML)
        and attach them to the in-flight item from response.meta.
        """
        from scrapy.http import HtmlResponse
        output = response.meta.get('output')
        cords = response.meta.get('cords')

        # The map may render inside an iframe; keep that frame's HTML handy.
        res = None
        if "childFrames" in response.data and response.data["childFrames"]:
            res = response.data["childFrames"][0]["html"]
      
        # Re-wrap the rendered HTML so xpath selectors work on it.
        response = response.data["html"]
        response = HtmlResponse(url="Dummy URL", body=response,  encoding='utf-8')
        
        try:
            match = None
            item_text = response.xpath(cords["xpath"]).extract_first()
            item_name = "coordination"

            if item_text:
                match = re.search(r'{}'.format(cords["regex_match"]), item_text.strip())
            elif res:
                match = re.search(r'{}'.format(cords["regex_match"]), res)
                
            if match:
                # cords["lat"]/["lon"] hold "$N" group references into
                # regex_match; extract the N and use that match group.
                lon_r = re.findall(r'\d+', cords["lon"])[0]
                lat_r = re.findall(r'\d+', cords["lat"])[0]
                org_lat = match.group(int(lat_r))
                org_lon = match.group(int(lon_r))
                if org_lat and org_lon and "srid" in cords and cords["srid"]:
                    # Convert from the configured SRID to WGS84.
                    inProj  = Proj("+init=EPSG:{}".format(cords["srid"]))
                    outProj = Proj("+init=EPSG:4326")
                    lon, lat = transform(inProj,outProj,org_lon,org_lat)
                    output[item_name] = {"lat": lat, "lon":lon, "org_lat":org_lat, "org_lon": org_lon}
                else:
                    output[item_name] = {"lat": org_lat, "lon":org_lon}
            else:
                if item_text:
                    regex_log = {"regex_log":{"message":"Failed to get item from this regex", "regex":cords["regex_match"], 'strings_from_xpath':item_text}, "xpath_log":""}
                else:
                    regex_log = {"regex_log":{"message":"Failed to get item from this regex", "regex":cords["regex_match"], 'strings_from_xpath':'* whole html docment *'}, "xpath_log":""}
                output["coordination"] = regex_log

        except Exception as e:
            regex_log = {"regex_log":"", "xpath_log":{"message":e, "xptah":cords["xpath"]}}
            output["coordination"] = regex_log

        yield output

    def parse_tiny_url(self, response):
        """Extract lat/lon from the resolved (un-shortened) map URL returned
        by the tiny_url_extract Lua script, then yield the finished item.
        """
        output = response.meta.get('output')
        cords = response.meta.get('cords')
        url = response.data["url"]
        regex = cords["regex_match"]
        match = re.search(r'{}'.format(regex), url)
        item_name = "coordination"
        if match:
            lon_r = re.findall(r'\d+', cords["lon"])[0]
            lat_r = re.findall(r'\d+', cords["lat"])[0]
            print("Group:", lon_r, lat_r)
            # NOTE(review): groups 1 and 2 are hard-coded here; lat_r/lon_r
            # computed above are never used.
            org_lat = match.group(1)
            org_lon = match.group(2)
            if org_lat and org_lon and "srid" in cords and cords["srid"]:
                # Convert from the configured SRID to WGS84.
                inProj  = Proj("+init=EPSG:{}".format(cords["srid"]))
                outProj = Proj("+init=EPSG:4326")
                lon, lat = transform(inProj,outProj,org_lon,org_lat)
                output[item_name] = {"lat": lat, "lon":lon, "org_lat":org_lat, "org_lon": org_lon}
            else:
                output[item_name] = {"lat": org_lat, "lon":org_lon}
        else:
            regex_log = {"regex_log":{"message":"Failed to get item from this Mapurl regex", "regex":cords["regex_match"], 'strings_from_xpath':'* whole html docment *'}, "xpath_log":""}
            output["coordination"] = regex_log
            
        yield output

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider and hook the open/close signal handlers."""
        spider = super(ShopSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_opened(self, spider):
        print('Opening {} spider'.format(spider.name))

    def spider_closed(self, spider):
        # Release the remote Selenium session when the crawl ends.
        self.driver.quit()
        print('Closing {} spider'.format(spider.name))