Get all snapshots from archive.org as a list
<?php
require_once 'vendor/autoload.php';
use Goutte\Client;
// Website whose archive.org snapshots are looked up by this script
$url = "http://medoo.in/";
/**
 * Get the url of the first snapshot of a website from archive.org.
 *
 * Requests the Wayback Machine url for the given year without following
 * redirects and reads the redirect target from the "Location" response header.
 *
 * @param string $url url of the archived website
 * @param int $year year of the first snapshot (default: 1996)
 * @return string|false snapshot url, or false when the request fails or the
 *                      response carries no usable Location header
 * @author Leksat <http://stackoverflow.com/a/11699301/1990745>
 */
function getFirstSnapshot($url, $year = 1996) {
    $waybackurl = "https://web.archive.org/web/$year/$url"; // < redirects to first snapshot

    $ch = curl_init($waybackurl);
    if ($ch === false) {
        return false;
    }

    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER => true,          // headers are needed to read "Location"
        CURLOPT_FOLLOWLOCATION => false, // the redirect target itself is the result
    ]);

    $response = curl_exec($ch);
    // Must be read before the handle is closed.
    $headerSize = (int)curl_getinfo($ch, CURLINFO_HEADER_SIZE);
    curl_close($ch);

    if (!is_string($response)) {
        return false; // transfer failed
    }

    // Only inspect the header part, so that a line of the response body that
    // happens to start with "Location:" is never mistaken for the redirect.
    $headers = substr($response, 0, $headerSize);
    if (!preg_match('/^Location:(.*)$/mi', $headers, $match)) {
        return false;
    }

    $location = trim($match[1]);

    return $location !== '' ? $location : false;
}
// Work out the year of the earliest snapshot; it is the start of the year
// range that is crawled afterwards.
$first_url = getFirstSnapshot($url);
if (!$first_url) {
    die('Could not find first snapshot');
}

// The year is expected to directly follow "/web/" in the redirect target (the
// same layout as the url requested by getFirstSnapshot()). Anchoring on that
// prefix avoids picking up an unrelated 4-digit run elsewhere in the url, and
// checking the match avoids silently continuing with year 0.
if (!preg_match('#/web/(\d{4})#', $first_url, $yearMatch)) {
    die('Could not determine the year of the first snapshot');
}
$firstFoundYear = (int)$yearMatch[1];
// Collect the snapshot links of every year from the first snapshot up to now.
$foundUrls = [];
$client = new Client(); // one client is reused for all yearly requests
foreach (range($firstFoundYear, (int)date('Y')) as $year) {
    echo ">> Year:" . $year . PHP_EOL;
    $crawler = $client->request('GET', "https://web.archive.org/web/$year*/$url"); // '*' => show calendar

    // Links inside ".day" elements that start with "/web/" and end with the target url.
    $selector = '.day a[href^="/web/"][href$="' . $url . '"]';

    // each() returns the values returned by the closure. Using that result
    // (instead of appending to a by-value captured copy of $foundUrls inside
    // the closure) is what actually makes the urls available after the loop.
    $yearUrls = $crawler->filter($selector)->each(function ($node) {
        $href = $node->attr('href');
        echo $href;
        echo PHP_EOL;
        return $href;
    });

    $foundUrls = array_merge($foundUrls, $yearUrls);
}
{
"require": {
"fabpot/goutte": "^3.2",
"digitalnature/php-ref": "^1.2"
}
}