PHP API/Scraper Sample
<?php
namespace App\Hapenesia\GsmArena;
use App\Color;
use App\Spec;
use Carbon\Carbon;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\FileCookieJar;
use GuzzleHttp\Handler\CurlHandler;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
use GuzzleHttp\TransferStats;
use Illuminate\Support\Facades\Cache;
use Sunra\PhpSimple\HtmlDomParser;
use Zend\Json\Json;
use function Stringy\create as s;
class Scraper
{
/**
 * Identifier of this data provider. Within this class it is used as the
 * cookie-file name prefix, as the provider_name stored on the built gadget,
 * and inside an error message.
 */
const PROVIDER_NAME = 'GSMARENA';
/**
* HTTP client used for every request issued by this class.
*
* @var \GuzzleHttp\Client
*/
private $client;
// Not referenced by any method of this class.
private $testValue = 'test';
/**
 * Builds the Guzzle client used by the scraper.
 *
 * The client runs on a cURL handler wrapped with the cookies, prepare-body,
 * redirect and http-errors middleware (pushed in that order), and keeps its
 * cookies in a uniquely named file under storage "app/scraper_cookies/".
 */
public function __construct()
{
    $cookiePath = storage_path(
        'app/scraper_cookies/'.uniqid(self::PROVIDER_NAME, true).'.txt'
    );

    $handlerStack = new HandlerStack();
    $handlerStack->setHandler(new CurlHandler());

    $middlewares = [
        Middleware::cookies(),
        Middleware::prepareBody(),
        Middleware::redirect(),
        Middleware::httpErrors(),
    ];
    foreach ($middlewares as $middleware) {
        $handlerStack->push($middleware);
    }

    $options = [
        'handler' => $handlerStack,
        'cookies' => new FileCookieJar($cookiePath, true),
    ];
    $this->client = new Client($options);
}
/**
 * Returns the provider's autocomplete list as Gadget objects keyed by the
 * provider gadget id.
 *
 * The list is served from the 'gsmarena_autocomplete' cache entry when that
 * entry carries a 'formatted' list built from the currently configured URL;
 * an entry that fails this check is dropped. Otherwise the list is downloaded,
 * converted with buildMatchedGadgetItem() and cached again.
 *
 * @return Gadget[]
 * @throws \Exception when the downloaded payload does not have the expected
 *                    [manufacturers, series] structure
 */
private function getFormattedGadgets()
{
    $listUrl = config('hapenesia.gsmarena.autocomplete_list_url');

    /** @var Gadget[] $formattedGadgets */
    $formattedGadgets = [];

    if (Cache::has('gsmarena_autocomplete')) {
        $cached = Cache::get('gsmarena_autocomplete');
        $usable = isset($cached['url'])
            && isset($cached['formatted'])
            && $cached['url'] === $listUrl;

        if ($usable) {
            $formattedGadgets = $cached['formatted'];
        } else {
            // Missing keys or a list built from a different URL: discard it.
            Cache::forget('gsmarena_autocomplete');
        }
    }

    if ($formattedGadgets) {
        return $formattedGadgets;
    }

    // NOTE(review): 'verify' => false disables TLS certificate verification
    // for this request; kept as in the original, confirm it is intentional.
    $res = $this->client->request('GET', $listUrl, ['verify' => false]);
    $expiresAt = Carbon::now()->addMinute(config('hapenesia.gsmarena.minute_expired'));
    $body = $res->getBody()->getContents();

    $gadgets = Json::decode($body, Json::TYPE_ARRAY);
    // Guard against an unexpected payload instead of failing on a bad offset.
    if (!is_array($gadgets)
        || !isset($gadgets[0], $gadgets[1])
        || !is_array($gadgets[0])
        || !is_array($gadgets[1])
    ) {
        throw new \Exception('Daftar gadget '.self::PROVIDER_NAME.' tidak valid');
    }

    $manufacturers = $gadgets[0];
    foreach ($gadgets[1] as $serie) {
        // The first entry seen for a given gadget id wins.
        if (!isset($formattedGadgets[$serie[1]])) {
            $formattedGadgets[$serie[1]] = self::buildMatchedGadgetItem($serie, $manufacturers);
        }
    }

    Cache::put(
        'gsmarena_autocomplete',
        [
            'url' => $listUrl,
            'body' => $body,
            'formatted' => $formattedGadgets,
        ],
        $expiresAt
    );

    return $formattedGadgets;
}
/**
 * Finds gadgets whose searchable text contains every word of the query,
 * ignoring case.
 *
 * The query is split on runs of whitespace and empty terms are dropped, so
 * leading, trailing or repeated spaces no longer produce empty search terms.
 * A query without any term returns an empty list without loading the gadget
 * list.
 *
 * @param string $gadgetName free-text query, e.g. "samsung galaxy"
 *
 * @return Gadget[] matching gadgets, in the order of the gadget list
 */
public function search($gadgetName)
{
    $searchParams = preg_split('/\s+/', trim((string)$gadgetName), -1, PREG_SPLIT_NO_EMPTY);
    if (!$searchParams) {
        return [];
    }

    $gadgetsMatched = [];
    foreach ($this->getFormattedGadgets() as $formattedGadget) {
        if (s($formattedGadget->search)->containsAll($searchParams, false)) {
            $gadgetsMatched[] = $formattedGadget;
        }
    }

    return $gadgetsMatched;
}
/**
 * Removes the 'gsmarena_autocomplete' entry from the cache, which is the
 * entry getFormattedGadgets() reads and writes.
 */
public function clearListCache()
{
Cache::forget('gsmarena_autocomplete');
}
/**
 * Converts one row of the autocomplete list into a Gadget.
 *
 * Row layout as read here: [0] manufacturer id, [1] gadget id, [2] name,
 * [3] alias, [4] thumbnail file name.
 *
 * @param array $serie         one row of the series list
 * @param array $manufacturers manufacturer names indexed by manufacturer id
 *
 * @return Gadget gadget whose "search" text is "<manufacturer> <name> <alias>"
 */
public static function buildMatchedGadgetItem($serie, $manufacturers)
{
    $maker = new Manufacturer();
    $maker->id = $serie[0];
    $maker->name = $manufacturers[$serie[0]];

    $item = new Gadget();
    $item->id = $serie[1];
    $item->name = $serie[2];
    $item->alias = $serie[3];
    $item->thumb = 'https://cdn2.gsmarena.com/vv/bigpic/'.$serie[4];
    $item->manufacturer = $maker;

    $searchParts = [$maker->name, $item->name, $item->alias];
    $item->search = implode(' ', $searchParts);

    return $item;
}
/**
 * Downloads a gadget's detail page, parses its specification tables and
 * builds an \App\Gadget model from them.
 *
 * The gadget model itself is populated but not saved by this method.
 * Manufacturer and Color rows that do not exist yet are created and saved.
 *
 * @param $gadgetId provider gadget id; also the key into the autocomplete list
 *
 * @return mixed array with 'provider' (the parsed spec tables) and
 *               'hapenesia_gadget' (the populated \App\Gadget)
 * @throws \Exception when the page cannot be parsed, a required spec is
 *                    missing, the gadget id is unknown, or no effective URL
 *                    was reported for the request
 */
public function getSpecs($gadgetId)
{
    $providerLink = false;
    $res = $this->client->request(
        'GET',
        config('hapenesia.gsmarena.gadget_url').$gadgetId.'.php',
        [
            // NOTE(review): TLS certificate verification is disabled here;
            // kept as in the original, confirm it is intentional.
            'verify' => false,
            'allow_redirects' => [
                'max' => 2,
                'referer' => false,
                'protocols' => [
                    'http',
                    'https',
                ],
            ],
            // Remember the effective URI reported for the transfer.
            'on_stats' => function (TransferStats $stats) use (&$providerLink) {
                $providerLink = $stats->getEffectiveUri();
            },
        ]
    );
    $providerLink = (string)$providerLink;

    $specs = self::parseSpecTables($res->getBody()->getContents());

    $memories = self::getMemories($specs);
    $screenSize = self::getScreenSize($specs);
    $mainCameraRes = self::getMainCameraResolutions($specs);
    $selfieCameraRes = self::getSelfieCameraResolutions($specs);
    $cpuCore = self::getCpuCore($specs);
    $battery = self::getBattery($specs);
    $colors = self::getColors($specs);

    $formattedGadgets = $this->getFormattedGadgets();
    if (!isset($formattedGadgets[$gadgetId])) {
        throw new \Exception('Gadget Tidak ditemukan');
    }
    if (!$providerLink) {
        throw new \Exception('Provider Link '.self::PROVIDER_NAME.' tidak ditemukan');
    }
    $gadget = $formattedGadgets[$gadgetId];

    // Look the manufacturer up by name and create it when missing.
    $manufacturer = \App\Manufacturer::where('name', $gadget->manufacturer->name)->first();
    if (!$manufacturer) {
        $manufacturer = new \App\Manufacturer();
        $manufacturer->name = $gadget->manufacturer->name;
        $manufacturer->save();
    }

    // Same lookup-or-create for every colour name.
    $gadgetColors = [];
    foreach ($colors as $colorName) {
        $color = Color::where('name', $colorName)->first();
        if (!$color) {
            $color = new Color();
            $color->name = $colorName;
            $color->save();
        }
        $gadgetColors[] = $color;
    }

    $hapenesiaGadget = new \App\Gadget();
    $hapenesiaGadget->name = $gadget->name;
    $hapenesiaGadget->alias = $gadget->alias;
    $hapenesiaGadget->searchable = $gadget->search;
    $hapenesiaGadget->provider_id = $gadget->id;
    $hapenesiaGadget->provider_name = self::PROVIDER_NAME;
    $hapenesiaGadget->thumb_pic = $gadget->thumb;
    $hapenesiaGadget->provider_link = $providerLink;
    $hapenesiaGadget->manufacturer_name = $manufacturer->name;
    $hapenesiaGadget->screen_size = $screenSize;
    $hapenesiaGadget->cpu_core = $cpuCore[0];
    $hapenesiaGadget->cpu_core_name = $cpuCore[1];
    // Multi-valued attributes and related models.
    $hapenesiaGadget->main_camera_resolutions = $mainCameraRes;
    $hapenesiaGadget->selfie_camera_resolutions = $selfieCameraRes;
    $hapenesiaGadget->battery = $battery;
    $hapenesiaGadget->memories = $memories;
    $hapenesiaGadget->colors = $gadgetColors;
    $hapenesiaGadget->manufacturer = $manufacturer;

    return [
        'provider' => $specs,
        'hapenesia_gadget' => $hapenesiaGadget,
    ];
}

/**
 * Parses the "#specs-list" tables of a gadget page into a nested array.
 *
 * Each table becomes one group keyed by the slug of its header cell; each row
 * that has both a "td.ttl" and a "td.nfo" cell becomes one entry keyed by the
 * slug of its title. Rows sharing a slug overwrite each other (last one wins).
 * Tables without a header cell and rows without a value cell are skipped.
 *
 * @param string $body HTML of the gadget page
 *
 * @return array group slug => ['key', 'name', 'specs' => [slug => ['key', 'name', 'value']]]
 * @throws \Exception when the HTML cannot be parsed or has no "#specs-list" element
 */
private static function parseSpecTables($body)
{
    $dom = HtmlDomParser::str_get_html($body);
    $specList = $dom ? $dom->find('#specs-list', 0) : null;
    if (!$specList) {
        throw new \Exception('Spesifikasi '.self::PROVIDER_NAME.' tidak ditemukan');
    }

    $specs = [];
    foreach ($specList->find('table') as $table) {
        $heading = $table->find('tr th', 0);
        if (!$heading) {
            continue;
        }
        $spec = $heading->plaintext;
        $normalizedSpec = (string)s($spec)->slugify();

        $specDetail = [];
        foreach ($table->find('tr') as $specItem) {
            $titleCell = $specItem->find('td.ttl', 0);
            $valueCell = $specItem->find('td.nfo', 0);
            if (!$titleCell || !$valueCell) {
                continue;
            }
            $specName = $titleCell->plaintext;
            $normalizedSpecName = (string)s($specName)->slugify();
            $specDetail[$normalizedSpecName] = [
                'key' => $normalizedSpecName,
                'name' => $specName,
                'value' => (string)s($valueCell->plaintext)->trim(),
            ];
        }

        $specs[$normalizedSpec] = [
            'key' => $normalizedSpec,
            'name' => $spec,
            'specs' => $specDetail,
        ];
    }

    return $specs;
}
/**
 * Extracts the RAM size of every memory variant from the "memory / internal"
 * spec value.
 *
 * Variants are separated by " or "; within a variant the second ", "-separated
 * field is taken as the RAM field. The " GB RAM" suffix is removed and the
 * rest is cast to int, so a field without that exact suffix (for example
 * "512 MB RAM") is cast as-is.
 *
 * @param $specs parsed spec tables
 *
 * @return array list of ['ram' => int]
 * @throws \Exception when the memory data is missing or a RAM size cannot be read
 */
private static function getMemories($specs)
{
    if (empty($specs['memory'])) {
        throw new \Exception('Memory Tidak ditemukan');
    }
    if (!isset($specs['memory']['specs']['internal']['value'])) {
        throw new \Exception('Memory internal Tidak ditemukan');
    }

    $ramList = [];
    $variants = explode(' or ', $specs['memory']['specs']['internal']['value']);
    foreach ($variants as $variant) {
        $fields = explode(', ', $variant);
        if (empty($fields[1])) {
            throw new \Exception('Memory RAM internal Tidak ditemukan');
        }

        $ramSize = (int)(string)s($fields[1])->replace(' GB RAM', '');
        if ($ramSize === 0) {
            throw new \Exception('RAM Tidak ditemukan');
        }
        $ramList[] = ['ram' => $ramSize];
    }

    if (count($ramList) === 0) {
        throw new \Exception('Data Memory Tidak ditemukan');
    }

    return $ramList;
}
/**
 * Reads the screen size from the "display / size" spec value.
 *
 * Only the text before the first ", " is used; it is cast to a float after
 * removing the literal " INCHES" (the replacement is case-sensitive).
 *
 * @param $specs parsed spec tables
 *
 * @return float
 * @throws \Exception when the display data is missing or the size evaluates to zero
 */
private static function getScreenSize($specs)
{
    if (empty($specs['display'])) {
        throw new \Exception('display Tidak ditemukan');
    }
    if (!isset($specs['display']['specs']['size']['value'])) {
        throw new \Exception('display size Tidak ditemukan');
    }

    $segments = explode(', ', $specs['display']['specs']['size']['value']);
    $sizeText = (string)s($segments[0])->replace(' INCHES', '');
    $inches = (double)$sizeText;

    if (!$inches) {
        throw new \Exception('screenSize Tidak ditemukan');
    }

    return $inches;
}
/**
 * Reads the main camera resolution(s), in megapixels, from the
 * "camera / primary" spec value.
 *
 * @param $specs parsed spec tables
 *
 * @return array list of ['resolution' => float]
 * @throws \Exception when the camera data is missing or a resolution cannot be read
 */
private static function getMainCameraResolutions($specs)
{
    return self::parseCameraResolutions($specs, 'primary', 'Dual: ', '$mainCameraRes');
}

/**
 * Reads the selfie camera resolution(s), in megapixels, from the
 * "camera / secondary" spec value.
 *
 * @param $specs parsed spec tables
 *
 * @return array list of ['resolution' => float]
 * @throws \Exception when the camera data is missing or a resolution cannot be read
 */
private static function getSelfieCameraResolutions($specs)
{
    return self::parseCameraResolutions($specs, 'secondary', 'Dual ', '$secondCameraRes');
}

/**
 * Shared parser behind the two camera getters.
 *
 * When the value contains "Dual" (case-insensitive), parenthesised notes and
 * the given prefix are removed and the text before the first ", " is split on
 * " + " into one resolution per part. Otherwise only the first token of the
 * text before the first ", " is used.
 *
 * @param array  $specs      parsed spec tables
 * @param string $specKey    key under camera/specs: 'primary' or 'secondary'
 * @param string $dualPrefix literal prefix removed from dual-camera values
 * @param string $errorLabel label used in the "cannot be read" error message
 *
 * @return array list of ['resolution' => float]
 * @throws \Exception when the camera data is missing or a resolution evaluates to zero
 */
private static function parseCameraResolutions($specs, $specKey, $dualPrefix, $errorLabel)
{
    if (!isset($specs['camera']) || !$specs['camera']) {
        throw new \Exception('camera Tidak ditemukan');
    }
    if (!isset($specs['camera']['specs'][$specKey]['value'])) {
        throw new \Exception('camera '.$specKey.' Tidak ditemukan');
    }

    $rawValue = $specs['camera']['specs'][$specKey]['value'];
    $text = s($rawValue);
    $resolutions = [];

    if ($text->contains('Dual', false)) {
        $text = $text->regexReplace('\([^)]*\)', '')->replace($dualPrefix, '');
        $segments = explode(', ', (string)$text);
        foreach (explode(' + ', (string)$segments[0]) as $part) {
            $megapixels = (double)(string)s($part)->replace(' MP', '')->replace(' ', '');
            if (!$megapixels) {
                throw new \Exception($errorLabel.' Tidak ditemukan');
            }
            $resolutions[] = ['resolution' => $megapixels];
        }

        return $resolutions;
    }

    $segments = explode(', ', $rawValue);
    // Drop " MP", then everything from the first remaining whitespace on.
    $megapixels = (double)(string)s($segments[0])->replace(' MP', '')->regexReplace('\s.*', '');
    if (!$megapixels) {
        throw new \Exception($errorLabel.' Tidak ditemukan');
    }
    $resolutions[] = ['resolution' => $megapixels];

    return $resolutions;
}
/**
 * Maps the "platform / cpu" spec value to a core count and label.
 *
 * Spaces and hyphens are removed before matching, and the match is
 * case-insensitive. The table is checked in order and the first hit wins;
 * a value matching none of the entries is reported as single core.
 *
 * @param $specs parsed spec tables
 *
 * @return array [int core count, string label]
 * @throws \Exception when the platform or cpu data is missing
 */
private static function getCpuCore($specs)
{
    if (empty($specs['platform'])) {
        throw new \Exception('platform Tidak ditemukan');
    }
    if (!isset($specs['platform']['specs']['cpu']['value'])) {
        throw new \Exception('platform cpu Tidak ditemukan');
    }

    $compactCpu = s($specs['platform']['specs']['cpu']['value'])
        ->replace(' ', '')
        ->replace('-', '');

    $coreTable = [
        'OctaCore' => [8, 'Octa Core'],
        'DualCore' => [2, 'Dual Core'],
        'QuadCore' => [4, 'Quad Core'],
        'HexaCore' => [6, 'Hexa Core'],
        'DecaCore' => [10, 'Deca Core'],
    ];
    foreach ($coreTable as $needle => $coreInfo) {
        if ($compactCpu->contains($needle, false)) {
            return $coreInfo;
        }
    }

    return [1, 'Single Core'];
}
/**
 * Reads the battery capacity from the untitled battery row (spec key "nbsp").
 *
 * The capacity is the number directly in front of "mAh" (case-insensitive),
 * with or without a space between them, e.g. "Li-Ion 3000 mAh battery" or
 * "3000mAh". A value with no number in front of "mAh" is rejected instead of
 * yielding 0.
 *
 * @param $specs parsed spec tables
 *
 * @return float capacity as written in front of "mAh"
 * @throws \Exception when the battery data is missing or no capacity is found
 */
private static function getBattery($specs)
{
    if (!isset($specs['battery']) || !$specs['battery']) {
        throw new \Exception('battery Tidak ditemukan');
    }
    if (!isset($specs['battery']['specs']['nbsp']['value'])) {
        throw new \Exception('battery nbsp Tidak ditemukan');
    }

    $description = (string)$specs['battery']['specs']['nbsp']['value'];
    if (!preg_match('/(\d+(?:\.\d+)?)\s*mAh/i', $description, $match)) {
        throw new \Exception('$bacteryV Tidak ditemukan');
    }

    return (double)$match[1];
}
/**
 * Reads the list of colour names from the "misc / colors" spec value.
 *
 * Parenthesised notes are removed from the whole value before it is split on
 * ", ", so a note that itself contains ", " cannot break a name apart. Each
 * name is title-cased and empty names are dropped.
 *
 * @param $specs parsed spec tables
 *
 * @return string[] colour names
 * @throws \Exception when the colour data is missing or no name remains
 */
private static function getColors($specs)
{
    if (!isset($specs['misc']) || !$specs['misc']) {
        throw new \Exception('misc Tidak ditemukan');
    }
    if (!isset($specs['misc']['specs']['colors']['value'])) {
        throw new \Exception('misc colors Tidak ditemukan');
    }

    // Start from an array: appending to `false` is deprecated since PHP 8.1.
    $colors = [];
    $withoutNotes = s($specs['misc']['specs']['colors']['value'])->regexReplace('\([^)]*\)', '');
    foreach ($withoutNotes->split(', ') as $colorPart) {
        $colorName = (string)$colorPart->titleize();
        if ($colorName !== '') {
            $colors[] = $colorName;
        }
    }

    if (!$colors) {
        throw new \Exception('$colors Tidak ditemukan');
    }

    return $colors;
}
}