Use PHP and cURL to run a scroll query against Elasticsearch and retrieve all results of a search query.
<?php
// Raise the memory limit — only needed when the result set is large relative
// to the default allocation.
ini_set('memory_limit', '512M');

// --- Global configuration ---
// Initial search endpoint (includes the index name; the scroll endpoint must not).
$url = "http://elasticsearchhost.com:9200/elasticsearch_index_attendance/_search";
$port = "9200";
$request_type = "POST";
$headers = [
    "Cache-Control: no-cache",
    "Content-Type: application/json",
];

// Accumulator for the merged final data. Not required if each batch is written
// to a CSV file inside the loop — that approach is faster and scales better.
$final_data = "";
// First function to call: runs the initial search (which opens the scroll
// context) and then pages through every remaining batch via scroll().
// Params: $url initial search endpoint, $port ES port, $request_type HTTP verb,
// $headers array of HTTP header strings. Prints progress/total as HTML; returns nothing.
function init($url, $port, $request_type, $headers) {
    // Query for the first request. "size" controls the batch size per scroll page.
    $query = "{\n \"size\": \"2000\",\n \"query\": {\n \"bool\": {\n \"must\": [\n { \"match\": { \"type\":\"engagement\" }}\n ]\n }\n }\n}";
    // "?scroll=3m" asks Elasticsearch to keep the scroll context alive for 3 minutes.
    $params = "?scroll=3m";
    // Issue the first search request.
    $results = curl_op($url, $params, $port, $query, $headers, $request_type);
    // Decode the JSON response into a PHP object.
    $res_arr = json_decode($results);
    // Number of documents in the first batch.
    $count = count($res_arr->hits->hits);
    // Scroll id returned by the first request; needed for every follow-up call.
    $scroll_id = $res_arr->_scroll_id;
    $ran = 1; // Keep track of how many times the loop ran. [Optional]
    $total_count = $count; // Running total; starts with the first batch. [Optional]
    // BUG FIX: the first batch was previously never collected, and the loop
    // merged the FIRST response's hits on every iteration instead of each new
    // batch. Seed the accumulator with the first batch, then append each
    // subsequent scroll batch below.
    $final_data = $res_arr->hits->hits;
    // Loop runs as long as the previous batch returned documents; an empty
    // batch means the scroll is exhausted.
    while ($count > 0) {
        $r = scroll($scroll_id, $port, $request_type, $headers); // Fetch next batch via curl.
        $r_arr = json_decode($r); // Decode to PHP object.
        // BUG FIX: refresh the scroll id from the latest response —
        // Elasticsearch may return a new _scroll_id on each scroll call.
        if (isset($r_arr->_scroll_id)) {
            $scroll_id = $r_arr->_scroll_id;
        }
        $d = $r_arr->hits->hits; // BUG FIX: was $res_arr->hits->hits (repeated the first batch).
        $count = count($r_arr->hits->hits); // Count the documents in this batch.
        // Bad idea for large datasets: better to write the CSV in small batches,
        // one per loop of 2000 items. The merge below keeps everything in memory.
        $final_data = array_merge($final_data, $d);
        // For knowing the total count. [Optional]
        $total_count = $total_count + $count;
        print "<br>Run:" . $ran . " Count: " . $count; // Show nth run & current count. [Optional]
        $ran++; // Increase run counter. [Optional]
    }
    print "<hr><br>Total count: " . $total_count . " Total times ran: " . $ran . "<hr> <h4>Full Data</h4>";
    // Browser will crash if printed: the array can hold thousands of documents.
    //print("<pre>" . print_r($final_data,true) . "</pre>");
}
// Fetch the next batch of results for an existing scroll context.
// Params: $scroll_id id from the previous response, $port ES port,
// $request_type HTTP verb, $headers array of HTTP header strings.
// Returns the raw JSON response string from curl_op().
function scroll($scroll_id, $port, $request_type, $headers) {
    // IMPORTANT: the scroll endpoint must NOT include the index name.
    $url = "http://elasticsearchhost.com:9200/_search";
    // Build the request body with json_encode instead of manual string
    // concatenation, so the scroll id is always correctly escaped as JSON.
    $query = json_encode(array(
        "scroll" => "3m",
        "scroll_id" => $scroll_id,
    ));
    // Appended to $url to form the ".../_search/scroll" endpoint.
    $params = "/scroll";
    // Issue the scroll call via curl and return the raw response.
    $results = curl_op($url, $params, $port, $query, $headers, $request_type);
    return $results;
}
//Function to initiate curl for any needs.
// Generic curl helper: POSTs (or sends $request_type) $query to $url.$params
// on $port with $headers, and returns the raw response body as a string.
// On transport failure it returns an error string instead (note: callers
// json_decode() the result, and an error string decodes to null).
function curl_op($url, $params, $port, $query, $headers, $request_type) {
    $handle = curl_init($url . $params);
    curl_setopt($handle, CURLOPT_PORT, $port);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_ENCODING, "");
    curl_setopt($handle, CURLOPT_MAXREDIRS, 10);
    curl_setopt($handle, CURLOPT_TIMEOUT, 30);
    curl_setopt($handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
    curl_setopt($handle, CURLOPT_CUSTOMREQUEST, $request_type);
    curl_setopt($handle, CURLOPT_POSTFIELDS, $query);
    curl_setopt($handle, CURLOPT_HTTPHEADER, $headers);

    $body = curl_exec($handle);
    $error = curl_error($handle);
    curl_close($handle);

    if ($error) {
        return "cURL Error #:" . $error;
    }
    return $body;
}
//Entry point: kick off the whole process — the initial search plus the
//follow-up scroll loop — using the global configuration defined above.
init($url,$port,$request_type,$headers);