Shoora
1/26/2018 - 11:48 AM

Simple sport results parser in PHP using XPath. For more information visit http://martinsikora.com/parsing-html-pages-using-xpath

Simple sport results parser in PHP using XPath. For more information visit http://martinsikora.com/parsing-html-pages-using-xpath

<?php

/**
 * Sport results parser.
 *
 * Downloads the livescore.com England soccer page, walks the results table
 * with XPath and prints an array of matches grouped by league name:
 *
 *   array(
 *       '<league>' => array(
 *           array(
 *               'status'     => 'finished'|'penalties'|'postponed'|'upcoming'|'pending'|'unknown',
 *               'begin'      => 'hh:mm',   // only for 'upcoming'
 *               'time'       => 'mm',      // only for 'pending'
 *               'team1'      => string,
 *               'team2'      => string,
 *               'team1score' => string,
 *               'team2score' => string,
 *               'date'       => 'YYYY-MM-DD'|null,
 *           ),
 *           ...
 *       ),
 *   )
 *
 * The XPath expressions are tied to the page's table-based markup and have
 * to be adjusted whenever that markup changes.
 */

/**
 * Turn a "<MonthName> <day>" date heading into a full "YYYY-MM-DD" date.
 *
 * The heading carries no year, so the year is chosen from last/this/next
 * year such that the resulting date is the one closest to $now. This covers
 * both recent results and upcoming fixtures, including across New Year.
 *
 * @param string $text heading text, e.g. "January 26"
 * @param int    $now  reference Unix timestamp
 *
 * @return string|null the resolved date, or null if the heading cannot be
 *                     parsed or names a day that does not exist
 */
function livescoreResolveDate($text, $now)
{
    $text = trim($text);
    $space = strpos($text, ' ');
    if ($space === false) {
        return null;
    }

    // date_parse() reads the month name on its own; strtotime() would merge
    // it with today's day-of-month and can roll over into the next month
    // (e.g. "February" evaluated on the 31st).
    $parsed = date_parse(substr($text, 0, $space));
    if (!is_int($parsed['month'])) {
        return null;
    }
    $month = $parsed['month'];
    $day = (int) preg_replace('/[^0-9]/', '', substr($text, $space + 1));

    $thisYear = (int) date('Y', $now);
    $bestDate = null;
    $bestDistance = null;
    for ($year = $thisYear - 1; $year <= $thisYear + 1; $year++) {
        // skips impossible days as well as Feb 29 in non-leap years
        if (!checkdate($month, $day, $year)) {
            continue;
        }
        $distance = abs(mktime(12, 0, 0, $month, $day, $year) - $now);
        if ($bestDistance === null || $distance < $bestDistance) {
            $bestDistance = $distance;
            $bestDate = sprintf('%04d-%02d-%02d', $year, $month, $day);
        }
    }

    return $bestDate;
}

/**
 * Interpret the status column of a match row.
 *
 * The column can contain:
 *   FT     - match finished
 *   Pen.   - match finished after penalties
 *   Postp. - match postponed to another day
 *   hh:mm  - upcoming match
 *   mm'    - pending match
 *
 * @param string $text raw cell text
 *
 * @return array 'status' plus 'begin' (upcoming) or 'time' (pending)
 */
function livescoreParseStatus($text)
{
    // keep only the characters a status can consist of
    $status = preg_replace('/[^a-zA-Z0-9\'\.:]+/', '', $text);

    if ($status === 'FT') {
        return array('status' => 'finished');
    }
    if ($status === 'Pen.') {
        return array('status' => 'penalties');
    }
    if ($status === 'Postp.') {
        return array('status' => 'postponed');
    }
    if (preg_match('/[0-9]{2}:[0-9]{2}/', $status)) {
        return array('status' => 'upcoming', 'begin' => $status);
    }
    if (strpos($status, "'") !== false) {
        return array('status' => 'pending', 'time' => trim($status, "'"));
    }

    return array('status' => 'unknown');
}

$curl = curl_init('http://www.livescore.com/soccer/england/');
if ($curl === false) {
    die("something's wrong!");
}
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10');
$html = curl_exec($curl);
curl_close($curl);

// curl_exec() yields false on failure; an empty body is useless as well
if (!$html) {
    die("something's wrong!");
}

// Real-world HTML is rarely valid: collect libxml's parse errors internally
// instead of silencing the whole call with "@", then restore the old mode.
$dom = new DOMDocument();
$previousErrorMode = libxml_use_internal_errors(true);
$dom->loadHTML($html);
libxml_clear_errors();
libxml_use_internal_errors($previousErrorMode);

$xpath = new DOMXPath($dom);

$scores = array();
// State carried from heading rows to the match rows that follow them.
// The defaults apply to match rows that appear before any heading.
$league = '';
$date = null;

$tableRows = $xpath->query('//table[1]//tr[4]//table//tr[1]/td[5]//table//tr');
foreach ($tableRows as $row) {
    // fetch all 'tds' inside this 'tr'; their count identifies the row type
    $td = $xpath->query('td', $row);

    if ($td->length === 1 && $xpath->query('td/b', $row)->length === 1) {
        // league heading: the league is the second text node of the cell
        $leagueNode = $xpath->query('td/text()', $row)->item(1);
        if ($leagueNode === null) {
            continue;
        }

        // cut the 3-character separator and leave just the league
        $league = (string) substr($leagueNode->textContent, 3);
        $scores[$league] = array();
    } elseif ($td->length === 2) {
        // date heading, valid for all match rows up to the next heading
        $date = livescoreResolveDate($td->item(1)->textContent, time());
    } elseif ($td->length === 4) {
        // match result: status | team 1 | score | team 2
        $match = livescoreParseStatus($td->item(0)->textContent);

        // pad so a score cell without '-' yields empty scores, not a notice
        $scoreParts = array_pad(explode('-', $td->item(2)->textContent), 2, '');

        $match['team1'] = $td->item(1)->textContent;
        $match['team2'] = $td->item(3)->textContent;
        $match['team1score'] = trim($scoreParts[0]);
        $match['team2score'] = trim($scoreParts[1]);
        $match['date'] = $date;

        $scores[$league][] = $match;
    }
}

print_r($scores);