LeBonCoin to CSV (Groovy)
#!/usr/bin/groovy
import org.htmlcleaner.*
import static C.*
/**
 * Search configuration shared by the whole script (pulled in through
 * {@code import static C.*}).
 */
class C {
    /** Query terms sent to the site; the empty string runs a search with no keyword. */
    static trucks = [
        "vito",
        "transporter",
        ""
    ]

    /** Site categories that are searched. */
    static cats = [
        "utilitaires",
        "caravaning"
        /*, 'voitures'*/
    ]

    /** Maximum mileage, kept as a string because it is concatenated into the search URL. */
    static kmMax = "200000"

    /** An ad is rejected when its title matches any of these words. */
    static words = [
        "change", "Fiat", "Renault", "Citroen", "Romeo", "Porche", "4x4",
        "Bmw", "Megane", "break", "Espace", "Opel", "Xsara"
    ]

    /** Department numbers accepted by the geographic filter. */
    static depts = [
        "09", "11", "12", "15", "31", "32", "33", "34",
        "46", "47", "48", "65", "66", "81", "82"
    ]

    /** Per-category [min, max] values passed as the {@code ps} / {@code pe} URL parameters. */
    static prices = [
        utilitaires: [5, 9],
        caravaning : [9, 13],
        voitures   : [10, 18]
    ]
}
// Script entry point: run one search for every (query term, category) pair.
trucks.each { t -> cats.each { c -> search(c, t) } }
// Terminate the JVM explicitly. The run completed normally, so report
// success (0); the previous status of 1 made every run look like a failure
// to the calling shell.
System.exit(0)
/**
 * Builds the search URL for one category / query term and walks every result page.
 *
 * @param category site category; must be a key of {@code prices}
 * @param type     free-text query term (may be empty)
 * @return whatever {@code parseEveryPage} returns; matching ads are printed,
 *         not collected
 * @throws IllegalArgumentException if no price range is configured for {@code category}
 */
List<Truck> search(category, type) {
    def range = prices.get(category)
    if (range == null) {
        throw new IllegalArgumentException("No price range configured for category: " + category)
    }
    // NOTE(review): these values look like the site's price-range codes rather
    // than euro amounts (the old comments said "4 000" / "8 000") - confirm.
    def priceMin = range[0]
    def priceMax = range[1]
    // Encode the query term so spaces or accents cannot corrupt the URL.
    def query = URLEncoder.encode(type.toString(), "UTF-8")
    // Declared locally so the URL does not leak into the script binding.
    def url = "http://www.leboncoin.fr/" + category +
            "/offres/midi_pyrenees/occasions/?f=a&th=1&ps=" + priceMin +
            "&pe=" + priceMax + "&q=" + query + "&me=" + kmMax
    parseEveryPage(category, type, url, 0)
}
/**
 * Processes a result page and every page reachable through its "suivante"
 * (next) link, handing each listed ad URL to {@code parseTruck}.
 *
 * @param category site category, forwarded to {@code parseTruck}
 * @param type     query term, forwarded to {@code parseTruck}
 * @param address  URL of the first result page to process
 * @param i        page counter; incremented per page but not otherwise used
 * @return always null
 */
def parseEveryPage(category, type, address, i) {
    def current = address
    while (true) {
        i++
        def doc = getXml(current)

        // Each row of the nested results table links to one ad.
        def rows = doc.body.div.div.div.table.tbody.tr.td.table.tbody.tr
        rows.each { row -> parseTruck(category, type, row.td.a.@href.text()) }

        // Find the pagination span mentioning "suivante"; the last match wins.
        def following = ""
        def pagerSpans = doc.body.div.div.div.div.div.span
        def nextSpans = pagerSpans.findAll { it.text() =~ '.*suivante.*' }
        nextSpans.each { link -> following = link.a.@href.text() }

        if (following.equals("")) {
            return null
        }
        current = following
    }
}
/**
 * Downloads one ad page, extracts its title and its "label : value"
 * attributes, builds a {@code Truck} and hands it to {@code push}.
 *
 * @param category site category the ad was found in
 * @param type     query term that produced the ad
 * @param address  URL of the ad page
 * @return the result of {@code push}
 */
def parseTruck(category, type, address) {
    def page = getXml(address)
    def title = page.body.div.div.div.div.span.h1.text()

    def map = [:]
    page.body.div.div.div.div.div.div.span.each { sp ->
        // Local variable: the old code assigned an undeclared name, which
        // leaked into the script binding.
        def parts = sp.text().split(" : ")
        // Skip spans that are not "label : value" pairs instead of failing
        // with an ArrayIndexOutOfBoundsException on parts[1].
        if (parts.length >= 2) {
            map.put(parts[0], parts[1].replaceAll("\\s+", ""))
        }
    }

    // The ad id is the run of digits right before ".htm". The dot is escaped
    // and only digits are captured so host names or query strings that
    // contain digits cannot end up in the id.
    def idMatcher = (address =~ /([0-9]+)\.htm/)
    def id = idMatcher.find() ? idMatcher.group(1) : address

    def t = new Truck(id, title, category, type, address, map)
    push(t)
}
/**
 * Prints the truck as a CSV line when it passes all filters: mileage within
 * {@code kmMax}, accepted department, and a title that matches none of the
 * rejected {@code words}.
 *
 * @param truck object exposing {@code title}, {@code km} and {@code isGeoZoneOk}
 */
def push(truck) {
    // Reject titles matching any configured word (each word is used as a
    // regular expression).
    // NOTE(review): the match is case-sensitive, so e.g. "RENAULT" is not
    // caught by 'Renault' - confirm whether that is intended.
    boolean titleValid = !words.any { w -> truck.title =~ /$w/ }

    // Use the shared mileage limit instead of a duplicated magic number.
    // A km of -1 (unknown mileage) still passes this check, as before.
    int maxKm = Integer.parseInt(kmMax.toString())

    if (truck.km <= maxKm && truck.isGeoZoneOk && titleValid) {
        println truck
    }
}
/**
 * Fetches the page at {@code address}, cleans its HTML with HtmlCleaner
 * and returns the result parsed by XmlSlurper for GPath navigation.
 *
 * @param address page URL as a string
 * @return the parsed document
 */
def getXml(address) {
    HtmlCleaner htmlCleaner = new HtmlCleaner()
    def cleanedTree = htmlCleaner.clean(address.toURL())
    // Serialize the cleaned tree with the cleaner's own properties.
    def xmlSerializer = new SimpleXmlSerializer(htmlCleaner.getProperties())
    def wellFormed = xmlSerializer.getXmlAsString(cleanedTree)
    // Both XmlSlurper flags off: non-validating, not namespace-aware.
    return new XmlSlurper(false, false).parseText(wellFormed)
}
/**
 * One classified ad, built from the "label : value" attributes scraped from
 * the ad page. {@code toString()} renders it as a CSV line.
 */
class Truck {
    /** Ad title with commas replaced by dots so it cannot break the CSV. */
    String title
    /** First run of digits of the "Prix" attribute, or -1 when unavailable. */
    int price
    /** Mileage from "Kilométrage", or -1 when unavailable. */
    int km
    String year
    String gas
    /** Postal code digits from the location, or -1 when unavailable. */
    int cp
    /** City name extracted from the location, or "?" when none is found. */
    String city
    String type
    String category
    String id
    String url
    /** True when the first two digits of the location are an accepted department. */
    boolean isGeoZoneOk
    /** Raw attributes as scraped (label -> value). */
    Map<String, String> map
    //String photos

    /**
     * @param pid       ad identifier
     * @param ptitle    ad title
     * @param pcategory site category the ad was found in
     * @param ptype     query term that produced the ad
     * @param paddress  URL of the ad
     * @param pmap      scraped attributes keyed by their label
     */
    public Truck(pid, ptitle, pcategory, ptype, paddress, pmap) {
        id = pid
        title = ptitle.replaceAll(",", ".")
        type = ptype
        category = pcategory
        url = paddress
        map = pmap

        year = map.get("Année-modèle")
        gas = map.get("Carburant")

        // A missing or digit-less price / mileage becomes -1 instead of
        // throwing, so one incomplete ad no longer aborts the whole run.
        price = toInt(firstMatch(map.get("Prix"), /[0-9]+/))
        km = toInt(firstMatch(map.get("Kilométrage"), /[0-9]+\s*[0-9]*/))

        // Location comes from "Ville", falling back to "Code postal".
        def location = map.get("Ville")
        if (location == null) {
            location = map.get("Code postal")
        }

        cp = toInt(firstMatch(location, /[0-9]+/))
        // Department = first two consecutive digits of the location.
        def dept = firstMatch(location, /[0-9][0-9]/)
        def cityName = firstMatch(location, /[A-Za-z\s-éèàâ\']+/)
        city = cityName == null ? "?" : cityName

        // A missing department (null) is never in the list, so it is rejected.
        isGeoZoneOk = depts.contains(dept)
    }

    /** Returns the first match of {@code regex} in {@code text}, or null when there is none. */
    private static String firstMatch(text, String regex) {
        if (text == null) {
            return null
        }
        def matcher = (text.toString() =~ regex)
        return matcher.find() ? matcher.group() : null
    }

    /** Parses a digit string (ignoring whitespace); returns -1 for null or unparseable input. */
    private static int toInt(String digits) {
        if (digits == null) {
            return -1
        }
        try {
            return Integer.parseInt(digits.replaceAll("\\s", ""))
        } catch (NumberFormatException ignored) {
            // Not representable as an int: treat it as unknown.
            return -1
        }
    }

    /**
     * CSV line: title, price, km, year, gas, postal code, city, type, category, url.
     * The postal code is zero-padded to five digits because storing it as an
     * int drops the leading zero (e.g. department 09).
     */
    String toString() {
        def postal = cp < 0 ? String.valueOf(cp) : String.format('%05d', cp)
        "$title,$price,$km,$year,$gas,$postal,$city,$type,$category,$url"
    }
}