webMax22
1/29/2018 - 3:32 PM

dom parser

<?php
// Import script: reads tenant ("arendator") pages of the old site and copies
// their data into info block 2. This part boots Bitrix, lists the pages to
// import and builds a name => element index of the active tenants.
require($_SERVER["DOCUMENT_ROOT"]."/bitrix/header.php");

// Pages to import: tenant name as stored in the info block + path on the old site.
// Assigned as a fresh literal so nothing defined earlier (e.g. by header.php) leaks in.
$arData = [
	["name" => "585GOLD", "href" => "/arendator/etach1/etach1_20.html"],
	["name" => "BEANHEARTS", "href" => "/arendator/etach3/etach3_120.html"],
];

// Without the iblock module CIBlockElement does not exist, so stop with a clear message.
if(!CModule::IncludeModule('iblock'))
{
	die('Модуль iblock не установлен');
}

$el = new CIBlockElement;

// Active elements of info block 2, indexed by their name.
$arArendators = [];

$rsArendators = CIBlockElement::GetList(false, ['IBLOCK_ID' => 2, 'ACTIVE' => 'Y'], false, false, ['ID', 'IBLOCK_ID', 'NAME', 'PROPERTY_OLD_LINK']);
while($arArendator = $rsArendators->GetNext())
{
	// GetNext() returns HTML-escaped field values and keeps the raw ones under
	// "~"-prefixed keys. The names in $arData are plain text, so the index has
	// to use the raw name, otherwise names containing &, quotes or < never match.
	$rawName = isset($arArendator['~NAME']) ? $arArendator['~NAME'] : $arArendator['NAME'];

	$arArendators[$rawName] = $arArendator;
}

// A single DOMDocument instance is reused for every parsed page.
$dom = new DOMDocument;

// NOTE(review): $start and $i are not read anywhere in the visible part of the
// script; kept in case later code relies on them.
$start = microtime(true);

$i = 0;

// Imports every page listed in $arData: parses the old tenant page, writes the
// extracted values into the matching info block element and downloads the
// pictures into parsing/<element ID>/{logo,images}.
// An existing parsing/<element ID> directory marks a tenant as already imported.

// CSS class of a social-network link => key inside $arItm['sn'].
$arSnClasses = [
	'contact_vk' => 'vk',
	'contact_fb' => 'fb',
	'contact_inst' => 'ig',
];

foreach($arData as $arItem)
{
	// The existence check has to come first: for an unknown tenant there is no
	// ID, the directory name degrades to "parsing/" and the "already imported"
	// check below would silently skip the item without reporting it.
	if(empty($arArendators[$arItem['name']]))
	{
		p('Элемента нету в базе');
		p($arItem);
		continue;
	}

	$elementId = $arArendators[$arItem['name']]['ID'];
	$rootDir = 'parsing/'.$elementId;

	if(is_dir($rootDir))
	{
		continue;
	}

	$link = 'http://site.ru'.$arItem['href'];
	$site = file_get_contents($link);

	// A failed or empty download cannot be parsed; skip it so it can be retried later.
	if($site === false || $site === '')
	{
		p('Не удалось загрузить страницу: '.$link);
		continue;
	}

	$arItm = [
		'name' => $arItem['name'],
		'oldLink' => $link,
		'description' => '',
	];

	// Real-world HTML is rarely valid; collect libxml warnings instead of printing them.
	$prevLibxmlState = libxml_use_internal_errors(true);
	$isLoaded = $dom->loadHTML($site);
	libxml_clear_errors();
	libxml_use_internal_errors($prevLibxmlState);

	$parentNode = $isLoaded ? $dom->getElementById("ContentContainerRight") : null;

	if(!$parentNode)
	{
		p('Не найден блок ContentContainerRight: '.$link);
		continue;
	}

	foreach($parentNode->childNodes as $node)
	{
		// childNodes also contains text and comment nodes, which have neither
		// tagName nor attributes.
		if(!($node instanceof DOMElement))
		{
			continue;
		}

		if($node->tagName == 'table' && $node->getAttribute('class') == 'SecInfopageTable')
		{
			$tdNodes = $node->getElementsByTagName('td');
			$imgCell = $tdNodes->item(0);
			$infoCell = $tdNodes->item(1);

			// Main picture: first <img> of the first cell.
			if($imgCell)
			{
				$imgNode = $imgCell->getElementsByTagName('img')->item(0);
				if($imgNode)
				{
					$arItm['img'] = $imgNode->getAttribute('src');
				}
			}

			if($infoCell)
			{
				// Logo: first <img> inside the first <div> of the second cell.
				$logoHolder = $infoCell->getElementsByTagName('div')->item(0);
				if($logoHolder)
				{
					$logoNodes = $logoHolder->getElementsByTagName('img');
					if($logoNodes->length > 0)
					{
						$arItm['logo'] = $logoNodes->item(0)->getAttribute('src');
					}
				}

				// Phone and site are both introduced by a <strong> label.
				foreach($infoCell->getElementsByTagName('strong') as $strongNode)
				{
					$label = $strongNode->textContent;

					if($label == 'Телефон:')
					{
						// The phone is the text node right after the label.
						$textNode = $strongNode->nextSibling;

						if($textNode instanceof DOMText && $textNode->length > 0)
						{
							$arItm['phone'] = $textNode->textContent;
						}
					}
					elseif($label == 'Сайт:')
					{
						// The link lives inside the second sibling after the label.
						$linkHolder = $strongNode->nextSibling ? $strongNode->nextSibling->nextSibling : null;

						if($linkHolder instanceof DOMElement)
						{
							$siteLinks = $linkHolder->getElementsByTagName('a');
							if($siteLinks->length > 0)
							{
								$arItm['link'] = $siteLinks->item(0)->getAttribute('href');
							}
						}
					}
				}

				// Social networks: links inside <ul class="contact_links">.
				foreach($infoCell->getElementsByTagName('ul') as $snNode)
				{
					if($snNode->getAttribute('class') != 'contact_links')
					{
						continue;
					}

					foreach($snNode->getElementsByTagName('a') as $aNode)
					{
						$snClass = $aNode->getAttribute('class');
						if(isset($arSnClasses[$snClass]))
						{
							$arItm['sn'][$arSnClasses[$snClass]] = $aNode->getAttribute('href');
						}
					}
				}
			}
		}

		// Description: every non-empty direct <p> child becomes one paragraph.
		if($node->tagName == 'p' && $node->textContent != '')
		{
			$arItm['description'] .= '<p>'.$node->textContent.'</p>'."\n";
		}

		// Additional pictures: targets of the links inside the photo container.
		if($node->tagName == 'div' && $node->getAttribute('class') == 'PhotosContainer')
		{
			foreach($node->getElementsByTagName('a') as $aNode)
			{
				$arItm['imgs'][] = $aNode->getAttribute('href');
			}
		}
	}

	// Update the properties.
	// NOTE(review): a value that was not found on the page is passed as null,
	// exactly as before; presumably that clears the property - confirm this is intended.
	$arProps = [
		'PHONE' => isset($arItm['phone']) ? $arItm['phone'] : null,
		'LINK' => isset($arItm['link']) ? $arItm['link'] : null,
		'FB_LINK' => isset($arItm['sn']['fb']) ? $arItm['sn']['fb'] : null,
		'VK_LINK' => isset($arItm['sn']['vk']) ? $arItm['sn']['vk'] : null,
		'IG_LINK' => isset($arItm['sn']['ig']) ? $arItm['sn']['ig'] : null,
	];

	foreach($arProps as $propCode => $propValue)
	{
		CIBlockElement::SetPropertyValues($elementId, 2, $propValue, $propCode);
	}

	// Update the description.
	$arFields = [
		"DETAIL_TEXT_TYPE" =>"html",
		"DETAIL_TEXT" => html_entity_decode($arItm['description']),
	];

	if(!$el->Update($elementId, $arFields))
	{
		p('Не удалось обновить элемент '.$elementId.': '.$el->LAST_ERROR);
	}

	// Store the pictures on the server.
	$logoPath = $rootDir.'/logo';
	$imgsPath = $rootDir.'/images';

	// Recursive mkdir also creates parsing/ and parsing/<ID> when they are missing.
	$dirsReady = true;
	foreach([$imgsPath, $logoPath] as $dir)
	{
		if(!is_dir($dir) && !mkdir($dir, 0777, true) && !is_dir($dir))
		{
			p('Не удалось создать каталог: '.$dir);
			$dirsReady = false;
		}
	}

	if(!$dirsReady)
	{
		continue;
	}

	// Collect [source path, target directory] pairs, then download them in one place.
	$arDownloads = [];

	if(!empty($arItm['logo']))
	{
		$arDownloads[] = [$arItm['logo'], $logoPath];
	}

	if(!empty($arItm['img']))
	{
		$arDownloads[] = [$arItm['img'], $imgsPath];
	}

	if(!empty($arItm['imgs']))
	{
		foreach($arItm['imgs'] as $img)
		{
			$arDownloads[] = [$img, $imgsPath];
		}
	}

	foreach($arDownloads as $arDownload)
	{
		list($src, $targetDir) = $arDownload;

		$fileLink = 'http://site.ru'.$src;
		$fileData = file_get_contents($fileLink);

		// Do not leave an empty file behind when the download fails.
		if($fileData === false)
		{
			p('Не удалось скачать файл: '.$fileLink);
			continue;
		}

		file_put_contents($targetDir.'/'.basename($fileLink), $fileData);
	}
}