pentatonicfunk
1/25/2018 - 6:45 PM

PHP API/Scraper Sample

PHP API/Scraper Sample

<?php

namespace App\Hapenesia\GsmArena;

use App\Color;
use App\Spec;
use Carbon\Carbon;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\FileCookieJar;
use GuzzleHttp\Handler\CurlHandler;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
use GuzzleHttp\TransferStats;
use Illuminate\Support\Facades\Cache;
use Sunra\PhpSimple\HtmlDomParser;
use Zend\Json\Json;

use function Stringy\create as s;


class Scraper
{

    const PROVIDER_NAME = 'GSMARENA';

    /**
     * @var \GuzzleHttp\Client
     */
    private $client;

    private $testValue = 'test';

    /**
     * Builds the Guzzle client used for every request made by this scraper.
     *
     * The handler stack is assembled by hand on top of a cURL handler, with the
     * cookies, prepare-body, redirect and http-errors middleware pushed in that
     * order. Cookies are kept in a per-instance file whose name is derived from
     * the provider name and uniqid().
     */
    public function __construct()
    {
        $handlers = new HandlerStack(new CurlHandler());

        $middlewares = [
            Middleware::cookies(),
            Middleware::prepareBody(),
            Middleware::redirect(),
            Middleware::httpErrors(),
        ];
        foreach ($middlewares as $middleware) {
            $handlers->push($middleware);
        }

        $cookieFile = storage_path('app/scraper_cookies/'.uniqid(self::PROVIDER_NAME, true).'.txt');

        $this->client = new Client([
            'handler' => $handlers,
            // second argument: session cookies are stored in the file as well
            'cookies' => new FileCookieJar($cookieFile, true),
        ]);
    }

    /**
     * Returns the autocomplete list as Gadget objects keyed by gadget id.
     *
     * The list is served from the 'gsmarena_autocomplete' cache entry when that
     * entry carries a non-empty 'formatted' list built from the currently
     * configured URL. Otherwise the list is downloaded, decoded, indexed and
     * cached again. An entry that is malformed or was built from another URL is
     * dropped before the download.
     *
     * @return Gadget[]
     * @throws \Exception when the downloaded payload does not have the expected shape
     */
    private function getFormattedGadgets()
    {
        $listUrl = config('hapenesia.gsmarena.autocomplete_list_url');

        // Single cache read; a missing entry comes back as null.
        $cached = Cache::get('gsmarena_autocomplete');
        if ($cached !== null) {
            $valid = isset($cached['url'], $cached['formatted']) && $cached['url'] === $listUrl;
            if (!$valid) {
                Cache::forget('gsmarena_autocomplete');
            } elseif ($cached['formatted']) {
                return $cached['formatted'];
            }
        }

        // NOTE(review): TLS certificate verification is disabled for this request.
        $res = $this->client->request(
            'GET',
            $listUrl,
            ['verify' => false]
        );

        $expiresAt = Carbon::now()->addMinute(config('hapenesia.gsmarena.minute_expired'));
        $body      = $res->getBody()->getContents();

        // Expected payload: [0] manufacturers indexed by id, [1] list of series rows.
        $gadgets = Json::decode($body, Json::TYPE_ARRAY);
        if (!is_array($gadgets)
            || !isset($gadgets[0], $gadgets[1])
            || !is_array($gadgets[0])
            || !is_array($gadgets[1])
        ) {
            throw new \Exception('Autocomplete list Tidak valid');
        }
        $manufacturers = $gadgets[0];
        $series        = $gadgets[1];

        /** @var Gadget[] $formattedGadgets */
        $formattedGadgets = [];
        foreach ($series as $serie) {
            // The first row seen for a gadget id wins.
            if (!isset($formattedGadgets[$serie[1]])) {
                $formattedGadgets[$serie[1]] = self::buildMatchedGadgetItem($serie, $manufacturers);
            }
        }

        Cache::put(
            'gsmarena_autocomplete',
            [
                'url'       => $listUrl,
                'body'      => $body,
                'formatted' => $formattedGadgets,
            ],
            $expiresAt
        );

        return $formattedGadgets;
    }

    /**
     * Finds every gadget whose search string contains all words of the query.
     *
     * The query is split on runs of whitespace; empty words (from doubled,
     * leading or trailing spaces) are discarded so they never reach the
     * substring check. Matching is case-insensitive. A query without any word
     * matches nothing.
     *
     * @param string $gadgetName  free-text query, e.g. manufacturer and model words
     *
     * @return Gadget[] list of matching gadgets (not keyed by id)
     */
    public function search($gadgetName)
    {
        $searchParams = preg_split('/\s+/', trim((string)$gadgetName), -1, PREG_SPLIT_NO_EMPTY);
        if (!$searchParams) {
            return [];
        }

        $gadgetsMatched = [];
        foreach ($this->getFormattedGadgets() as $formattedGadget) {
            // second argument false = case-insensitive comparison
            if (s($formattedGadget->search)->containsAll($searchParams, false)) {
                $gadgetsMatched[] = $formattedGadget;
            }
        }

        return $gadgetsMatched;
    }

    /**
     * Removes the cached autocomplete list (cache key 'gsmarena_autocomplete').
     */
    public function clearListCache()
    {
        Cache::forget('gsmarena_autocomplete');
    }

    /**
     * Maps one row of the autocomplete series list onto a Gadget.
     *
     * Row layout as read here: [0] manufacturer id, [1] gadget id, [2] name,
     * [3] alias, [4] picture file name (appended to the bigpic CDN URL).
     * The gadget's search string is "<manufacturer name> <name> <alias>".
     *
     * @param array $serie          one row of the series list
     * @param array $manufacturers  manufacturer names indexed by manufacturer id
     *
     * @return Gadget
     */
    public static function buildMatchedGadgetItem($serie, $manufacturers)
    {
        $item        = new Gadget();
        $item->id    = $serie[1];
        $item->name  = $serie[2];
        $item->alias = $serie[3];
        $item->thumb = 'https://cdn2.gsmarena.com/vv/bigpic/'.$serie[4];

        $maker       = new Manufacturer();
        $maker->id   = $serie[0];
        $maker->name = $manufacturers[$serie[0]];

        $item->manufacturer = $maker;

        $searchParts  = [$maker->name, $item->name, $item->alias];
        $item->search = implode(' ', $searchParts);

        return $item;
    }

    /**
     * Scrapes one gadget page and maps it onto an unsaved \App\Gadget.
     *
     * The page at <configured gadget_url><gadgetId>.php is downloaded (following
     * at most two redirects), its "#specs-list" tables are parsed into a nested
     * array, and the individual figures (memory, screen size, cameras, CPU,
     * battery, colors) are extracted from that array.
     *
     * Side effects: an \App\Manufacturer row and one Color row per color name
     * are saved when no row with that name exists yet. The \App\Gadget itself
     * is only populated here; it is not saved by this method.
     *
     * @param mixed $gadgetId  gadget id as it appears in the autocomplete list
     *
     * @return array ['provider' => parsed spec groups, 'hapenesia_gadget' => \App\Gadget]
     * @throws \Exception when the page cannot be parsed, a required spec is
     *                    missing, the id is not in the autocomplete list, or
     *                    no effective URL was reported for the request
     */
    public function getSpecs($gadgetId)
    {
        $providerLink = false;
        // NOTE(review): TLS certificate verification is disabled for this request.
        $res = $this->client->request(
            'GET',
            config('hapenesia.gsmarena.gadget_url').$gadgetId.'.php',
            [
                'verify'          => false,
                'allow_redirects' => [
                    'max'       => 2,
                    'referer'   => false,
                    'protocols' => [
                        'http',
                        'https',
                    ],
                ],
                // Remember the URL that was finally fetched (after redirects).
                'on_stats'        => function (TransferStats $stats) use (&$providerLink) {
                    $providerLink = $stats->getEffectiveUri();
                },
            ]
        );

        $providerLink = (string)$providerLink;
        $specs        = self::parseSpecTables($res->getBody()->getContents());

        // Each extractor throws \Exception when its figure is missing.
        $memories        = self::getMemories($specs);
        $screenSize      = self::getScreenSize($specs);
        $mainCameraRes   = self::getMainCameraResolutions($specs);
        $selfieCameraRes = self::getSelfieCameraResolutions($specs);
        list($cpuCoreCount, $cpuCoreName) = self::getCpuCore($specs);
        $battery         = self::getBattery($specs);
        $colors          = self::getColors($specs);

        $formattedGadgets = $this->getFormattedGadgets();
        if (!isset($formattedGadgets[$gadgetId])) {
            throw new \Exception('Gadget Tidak ditemukan');
        }

        if (!$providerLink) {
            throw new \Exception('Provider Link '.self::PROVIDER_NAME.' tidak ditemukan');
        }
        $gadget = $formattedGadgets[$gadgetId];

        // Find the manufacturer by name, creating it on first sight.
        $manufacturer = \App\Manufacturer::where('name', $gadget->manufacturer->name)->first();
        if (!$manufacturer) {
            $manufacturer       = new \App\Manufacturer();
            $manufacturer->name = $gadget->manufacturer->name;
            $manufacturer->save();
        }

        // Same find-or-create for every color name.
        $gadgetColors = [];
        foreach ($colors as $colorName) {
            $color = Color::where('name', $colorName)->first();
            if (!$color) {
                $color       = new Color();
                $color->name = $colorName;
                $color->save();
            }
            $gadgetColors[] = $color;
        }

        $hapenesiaGadget       = new \App\Gadget();
        $hapenesiaGadget->name = $gadget->name;

        $hapenesiaGadget->alias         = $gadget->alias;
        $hapenesiaGadget->searchable    = $gadget->search;
        $hapenesiaGadget->provider_id   = $gadget->id;
        $hapenesiaGadget->provider_name = self::PROVIDER_NAME;
        $hapenesiaGadget->thumb_pic     = $gadget->thumb;
        $hapenesiaGadget->provider_link = $providerLink;

        $hapenesiaGadget->manufacturer_name = $manufacturer->name;

        $hapenesiaGadget->screen_size   = $screenSize;
        $hapenesiaGadget->cpu_core      = $cpuCoreCount;
        $hapenesiaGadget->cpu_core_name = $cpuCoreName;

        // Multi-valued attributes and related models.
        $hapenesiaGadget->main_camera_resolutions   = $mainCameraRes;
        $hapenesiaGadget->selfie_camera_resolutions = $selfieCameraRes;
        $hapenesiaGadget->battery                   = $battery;
        $hapenesiaGadget->memories                  = $memories;
        $hapenesiaGadget->colors                    = $gadgetColors;
        $hapenesiaGadget->manufacturer              = $manufacturer;

        return [
            'provider'         => $specs,
            'hapenesia_gadget' => $hapenesiaGadget,
        ];
    }

    /**
     * Parses the "#specs-list" tables of a gadget page into a nested array.
     *
     * Result shape: [groupSlug => ['key', 'name', 'specs' => [rowSlug =>
     * ['key', 'name', 'value']]]]. The group name is the table's first header
     * cell; each row contributes its "td.ttl" cell as name and its "td.nfo"
     * cell as value. Rows sharing a slug overwrite each other, last one wins.
     * Tables without a header cell and rows lacking either cell are skipped.
     *
     * @param string $html  body of the gadget page
     *
     * @return array
     * @throws \Exception when the markup cannot be loaded or has no #specs-list element
     */
    private static function parseSpecTables($html)
    {
        $dom = HtmlDomParser::str_get_html($html);
        if (!$dom) {
            throw new \Exception('Halaman spesifikasi Tidak ditemukan');
        }

        $specList = $dom->find('#specs-list', 0);
        if (!$specList) {
            throw new \Exception('specs-list Tidak ditemukan');
        }

        $specs = [];
        foreach ($specList->find('table') as $table) {
            $heading = $table->find('tr th', 0);
            if (!$heading) {
                continue;
            }
            $spec           = $heading->plaintext;
            $normalizedSpec = (string)s($spec)->slugify();

            $specDetail = [];
            foreach ($table->find('tr') as $specItem) {
                $nameCell  = $specItem->find('td.ttl', 0);
                $valueCell = $specItem->find('td.nfo', 0);
                if (!$nameCell || !$valueCell) {
                    continue;
                }

                $specName           = $nameCell->plaintext;
                $normalizedSpecName = (string)s($specName)->slugify();

                $specDetail[$normalizedSpecName] = [
                    'key'   => $normalizedSpecName,
                    'name'  => $specName,
                    'value' => (string)s($valueCell->plaintext)->trim(),
                ];
            }

            $specs[$normalizedSpec] = [
                'key'   => $normalizedSpec,
                'name'  => $spec,
                'specs' => $specDetail,
            ];
        }

        return $specs;
    }

    /**
     * Extracts the RAM size, in whole gigabytes, of every memory variant.
     *
     * Reads $specs['memory']['specs']['internal']['value']. Variants are
     * separated by " or "; within a variant the second ", "-separated part is
     * taken as the RAM figure (e.g. "64 GB, 4 GB RAM or 32 GB, 3 GB RAM").
     * A figure given in MB is converted to whole GB instead of being read as
     * GB; fractions are truncated, and a result of 0 is rejected.
     *
     * @param array $specs  parsed spec groups
     *
     * @return array list of ['ram' => int]
     * @throws \Exception when the memory group, its internal value or a usable RAM figure is missing
     */
    private static function getMemories($specs)
    {
        if (empty($specs['memory'])) {
            throw new \Exception('Memory Tidak ditemukan');
        }

        if (!isset($specs['memory']['specs']['internal']['value'])) {
            throw new \Exception('Memory internal Tidak ditemukan');
        }

        $memories = [];
        foreach (explode(' or ', $specs['memory']['specs']['internal']['value']) as $variant) {
            $parts = explode(', ', $variant);
            if (empty($parts[1])) {
                throw new \Exception('Memory RAM internal Tidak ditemukan');
            }

            if (preg_match('/^\s*(\d+(?:\.\d+)?)\s*MB\b/i', $parts[1], $match)) {
                // "2048 MB RAM" -> 2; anything below 1 GB becomes 0 and is rejected below.
                $ram = (int)floor((float)$match[1] / 1024);
            } else {
                // The int cast keeps the leading digits only ("3/4" -> 3, "1.5" -> 1).
                $ram = (int)str_replace(' GB RAM', '', $parts[1]);
            }

            if (!$ram) {
                throw new \Exception('RAM Tidak ditemukan');
            }
            $memories[] = ['ram' => $ram];
        }

        return $memories;
    }

    /**
     * Extracts the screen size as a float from the display group.
     *
     * Reads $specs['display']['specs']['size']['value'], keeps the text before
     * the first ", ", removes the literal " INCHES" and casts what is left to
     * a float (the cast uses the leading numeric part of the string).
     *
     * @param array $specs  parsed spec groups
     *
     * @return float
     * @throws \Exception when the display group or its size is missing, or the size casts to 0
     */
    private static function getScreenSize($specs)
    {
        if (empty($specs['display'])) {
            throw new \Exception('display Tidak ditemukan');
        }

        if (!isset($specs['display']['specs']['size']['value'])) {
            throw new \Exception('display size Tidak ditemukan');
        }

        $segments = explode(', ', $specs['display']['specs']['size']['value']);
        $cleaned  = s($segments[0])->replace(' INCHES', '');
        $inches   = (float)(string)$cleaned;

        if ($inches) {
            return $inches;
        }

        throw new \Exception('screenSize Tidak ditemukan');
    }

    /**
     * Extracts the main camera resolution(s), in megapixels.
     *
     * Reads $specs['camera']['specs']['primary']['value'].
     *
     * @param array $specs  parsed spec groups
     *
     * @return array list of ['resolution' => float], one entry per sensor
     * @throws \Exception when the camera group or its primary value is missing, or no resolution can be read
     */
    private static function getMainCameraResolutions($specs)
    {
        if (empty($specs['camera'])) {
            throw new \Exception('camera Tidak ditemukan');
        }

        if (!isset($specs['camera']['specs']['primary']['value'])) {
            throw new \Exception('camera primary Tidak ditemukan');
        }

        return self::parseCameraResolutions(
            $specs['camera']['specs']['primary']['value'],
            '$mainCameraRes Tidak ditemukan'
        );
    }

    /**
     * Extracts the selfie camera resolution(s), in megapixels.
     *
     * Reads $specs['camera']['specs']['secondary']['value'].
     *
     * @param array $specs  parsed spec groups
     *
     * @return array list of ['resolution' => float], one entry per sensor
     * @throws \Exception when the camera group or its secondary value is missing, or no resolution can be read
     */
    private static function getSelfieCameraResolutions($specs)
    {
        if (empty($specs['camera'])) {
            throw new \Exception('camera Tidak ditemukan');
        }

        if (!isset($specs['camera']['specs']['secondary']['value'])) {
            throw new \Exception('camera secondary Tidak ditemukan');
        }

        return self::parseCameraResolutions(
            $specs['camera']['specs']['secondary']['value'],
            '$secondCameraRes Tidak ditemukan'
        );
    }

    /**
     * Shared parser behind the main and selfie camera extractors.
     *
     * Only the text before the first ", " is used. When the value mentions
     * "dual" anywhere (case-insensitive), parenthesised notes are removed
     * first, a leading "Dual" / "Dual:" marker is dropped whatever its case,
     * and the rest is split on " + " into one figure per sensor. Otherwise a
     * single figure is read. Each figure is cast to float after " MP" is
     * removed.
     *
     * @param string $rawValue         camera spec text
     * @param string $notFoundMessage  message of the exception thrown when a figure casts to 0
     *
     * @return array list of ['resolution' => float]
     * @throws \Exception when any figure is missing or zero
     */
    private static function parseCameraResolutions($rawValue, $notFoundMessage)
    {
        $rawValue   = (string)$rawValue;
        $candidates = [];

        if (stripos($rawValue, 'Dual') !== false) {
            $withoutNotes = (string)preg_replace('/\([^)]*\)/', '', $rawValue);
            $firstSegment = explode(', ', $withoutNotes)[0];
            $firstSegment = (string)preg_replace('/^\s*Dual:?\s*/i', '', $firstSegment);
            foreach (explode(' + ', $firstSegment) as $part) {
                // "12 MP " -> "12"
                $candidates[] = str_replace([' MP', ' '], '', $part);
            }
        } else {
            $firstSegment = str_replace(' MP', '', explode(', ', $rawValue)[0]);
            // Keep only what precedes the first whitespace ("13 (f/2.0" -> "13").
            $candidates[] = (string)preg_replace('/\s.*/s', '', $firstSegment);
        }

        $resolutions = [];
        foreach ($candidates as $candidate) {
            $resolution = (float)$candidate;
            if (!$resolution) {
                throw new \Exception($notFoundMessage);
            }
            $resolutions[] = ['resolution' => $resolution];
        }

        return $resolutions;
    }

    /**
     * Derives the CPU core count and its label from the platform group.
     *
     * Reads $specs['platform']['specs']['cpu']['value'], removes spaces and
     * hyphens, and looks case-insensitively for "OctaCore", "DualCore",
     * "QuadCore", "HexaCore" and "DecaCore", in that order. The first hit
     * decides; when none is found the CPU is reported as single core.
     *
     * @param array $specs  parsed spec groups
     *
     * @return array [int core count, string label]
     * @throws \Exception when the platform group or its cpu value is missing
     */
    private static function getCpuCore($specs)
    {
        if (empty($specs['platform'])) {
            throw new \Exception('platform Tidak ditemukan');
        }

        if (!isset($specs['platform']['specs']['cpu']['value'])) {
            throw new \Exception('platform cpu Tidak ditemukan');
        }

        $compact = s($specs['platform']['specs']['cpu']['value'])
            ->replace(' ', '')
            ->replace('-', '');

        // Checked top to bottom; the order is significant.
        $knownLayouts = [
            'OctaCore' => [8, 'Octa Core'],
            'DualCore' => [2, 'Dual Core'],
            'QuadCore' => [4, 'Quad Core'],
            'HexaCore' => [6, 'Hexa Core'],
            'DecaCore' => [10, 'Deca Core'],
        ];

        foreach ($knownLayouts as $needle => $layout) {
            if ($compact->contains($needle, false)) {
                return $layout;
            }
        }

        return [1, 'Single Core'];
    }

    /**
     * Extracts the battery capacity in mAh.
     *
     * Reads $specs['battery']['specs']['nbsp']['value'] and takes the number
     * directly in front of the first "mAh" (case-insensitive, with or without
     * a space in between), e.g. "Non-removable Li-Ion 3000 mAh battery".
     *
     * @param array $specs  parsed spec groups
     *
     * @return float
     * @throws \Exception when the battery group or its value is missing, or no non-zero capacity precedes "mAh"
     */
    private static function getBattery($specs)
    {
        if (empty($specs['battery'])) {
            throw new \Exception('battery Tidak ditemukan');
        }

        if (!isset($specs['battery']['specs']['nbsp']['value'])) {
            throw new \Exception('battery nbsp Tidak ditemukan');
        }

        $capacity = 0.0;
        $text     = (string)$specs['battery']['specs']['nbsp']['value'];
        if (preg_match('/(\d+(?:\.\d+)?)\s*mAh/i', $text, $match)) {
            $capacity = (float)$match[1];
        }

        // Validate after the numeric conversion so a non-numeric value is rejected.
        if (!$capacity) {
            throw new \Exception('$bacteryV Tidak ditemukan');
        }

        return $capacity;
    }

    /**
     * Extracts the list of color names from the misc group.
     *
     * Reads $specs['misc']['specs']['colors']['value'], splits it on ", ",
     * removes parenthesised notes from every entry and titleizes what is left.
     * Entries that end up blank are dropped.
     *
     * @param array $specs  parsed spec groups
     *
     * @return string[]
     * @throws \Exception when the misc group or its colors value is missing, or no color name remains
     */
    private static function getColors($specs)
    {
        if (empty($specs['misc'])) {
            throw new \Exception('misc Tidak ditemukan');
        }

        if (!isset($specs['misc']['specs']['colors']['value'])) {
            throw new \Exception('misc colors Tidak ditemukan');
        }

        // Start from an array: appending to `false` is deprecated since PHP 8.1.
        $colors = [];
        foreach (s($specs['misc']['specs']['colors']['value'])->split(', ') as $rawColor) {
            $colorName = (string)$rawColor->regexReplace('\([^)]*\)', '')->titleize();
            if (trim($colorName) !== '') {
                $colors[] = $colorName;
            }
        }

        if (!$colors) {
            throw new \Exception('$colors Tidak ditemukan');
        }

        return $colors;
    }

}