krkr
1/24/2012 - 11:03 PM

LeBonCoin to csv Groovy

LeBonCoin to csv Groovy

#!/usr/bin/groovy

import org.htmlcleaner.*

import static C.*

/**
 * Search configuration shared by the whole script (pulled in through
 * {@code import static C.*}).
 */
class C {
	// Keywords sent as the q= search term; the empty string searches without a keyword.
	static trucks = [
		'vito',
		'transporter',
		''
	]

	// Category path segments of the search URL ('voitures' is currently disabled).
	static cats = [
		'utilitaires',
		'caravaning'
	]

	// Upper mileage bound, sent as the me= search parameter.
	static kmMax = "200000"

	// An ad whose title contains any of these is not printed (case-sensitive).
	static words = [
		'change', 'Fiat', 'Renault', 'Citroen', 'Romeo', 'Porche', '4x4',
		'Bmw', 'Megane', 'break', 'Espace', 'Opel', 'Xsara'
	]

	// Two-digit department codes an ad must belong to in order to be printed.
	static depts = [
		"09", "11", "12", "15", "31",
		"32", "33", "34", "46", "47",
		"48", "65", "66", "81", "82"
	]

	// Per category: the [ps, pe] pair sent as lower/upper price parameters.
	static prices = [
		'utilitaires': [5, 9],
		'caravaning': [9, 13],
		'voitures': [10, 18]
	]
}


// Run one search per (keyword, category) pair; ads that pass the filters are
// printed to stdout as comma-separated rows.
trucks.each { t ->
	cats.each { c ->
		search(c, t)
	}
}

// Reaching this point means every search completed, so report success.
// (The script previously exited with status 1, which shells treat as a failure.)
System.exit(0)

/**
 * Builds the leboncoin.fr search URL for one category/keyword pair and crawls
 * every result page of that search.
 *
 * @param category category path segment of the URL; must be a key of prices
 * @param type     keyword sent as the q= parameter (may be empty)
 * @return the value returned by parseEveryPage; the caller in this script ignores it
 */
List<Truck> search(category, type) {
	// prices maps a category to its [ps, pe] pair. NOTE(review): the values look like
	// indices into the site's price scale rather than amounts (an earlier note gave
	// "4 000" / "8 000" for the utilitaires pair) - confirm against the site.
	def range = prices.get(category)
	String priceMin = range[0]
	String priceMax = range[1]

	// Declared as a local so the URL no longer leaks into the script binding as a global.
	def url = "http://www.leboncoin.fr/" + category + "/offres/midi_pyrenees/occasions/?f=a&th=1" +
		"&ps=" + priceMin + "&pe=" + priceMax + "&q=" + type + "&me=" + kmMax
	parseEveryPage(category, type, url, 0)
}


/**
 * Crawls a result listing page by page: every ad link found in the listing table
 * is handed to parseTruck, then the link inside the span whose text contains
 * "suivante" is followed until there is none.
 *
 * @param category category passed through to parseTruck
 * @param type     search keyword passed through to parseTruck
 * @param address  URL of the first result page
 * @param i        unused; kept so existing callers keep working
 */
def parseEveryPage(category, type, address, i) {
	// Pages already fetched, so a "next" link that loops back cannot crawl forever.
	def visited = [] as Set
	def current = address

	// Iterates instead of recursing, so the page count is not limited by stack depth.
	while (current && visited.add(current)) {
		def page = getXml(current)

		page.body.div.div.div.table.tbody.tr.td.table.tbody.tr.each { tr ->
			parseTruck(category, type, tr.td.a.@href.text())
		}

		// When several spans match, the last one wins; an empty href ends the crawl.
		def nextPage = ""
		page.body.div.div.div.div.div.span.findAll { it.text() =~ '.*suivante.*' }.each { sp ->
			nextPage = sp.a.@href.text()
		}
		current = nextPage
	}
}

/**
 * Fetches one ad page, collects its "label : value" spans into a map, builds a
 * Truck from them and hands it to push.
 *
 * @param category category the ad was found in
 * @param type     search keyword the ad was found with
 * @param address  URL of the ad page; must contain digits followed by "htm",
 *                 otherwise extracting the id throws
 */
def parseTruck(category, type, address) {
	def page = getXml(address)
	def title = page.body.div.div.div.div.span.h1.text()

	def map = [:]
	page.body.div.div.div.div.div.div.span.each { sp ->
		// Local variable: this used to be an undeclared name stored in the script binding.
		def parts = sp.text().split(" : ")
		// Spans without a " : " separator carry no label/value pair; skip them
		// instead of failing on parts[1].
		if (parts.length >= 2) {
			// All whitespace is stripped from the value (e.g. "120 000" becomes "120000").
			map.put(parts[0], parts[1].replaceAll("\\s+", ""))
		}
	}

	// The ad id is the text from the first digit of the URL up to the last "htm" suffix.
	def id = (address =~ /([0-9].*).htm/)[0][1]
	def t = new Truck(id, title, category, type, address, map)
	push(t)
}

/**
 * Prints the truck to stdout when it passes all three filters: mileage within
 * kmMax, department inside the accepted zone, and a title free of excluded words.
 *
 * @param truck the parsed ad to filter and print
 */
def push(truck) {
	// Excluded words are matched as plain, case-sensitive substrings, so a word
	// containing a regex metacharacter cannot change the meaning of the filter.
	boolean titleValid = !words.any { w -> truck.title.contains(w) }

	// Uses the same bound as the me= search parameter instead of a second literal.
	// An unknown mileage is stored as -1 and therefore passes.
	boolean kmOk = truck.km <= Integer.parseInt(kmMax.toString())

	if (kmOk && truck.isGeoZoneOk && titleValid) {
		println truck
	}
}



/**
 * Downloads the page at the given address, tidies its HTML with HtmlCleaner and
 * returns it parsed by a non-validating, non-namespace-aware XmlSlurper.
 *
 * @param address page URL as text
 * @return the parsed document, navigable as page.body...
 */
def getXml(address) {
	def htmlCleaner = new HtmlCleaner()
	def cleanedRoot = htmlCleaner.clean(address.toURL())

	// Serialize the cleaned tree with the cleaner's own properties, then re-parse it.
	def markup = new SimpleXmlSerializer(htmlCleaner.getProperties()).getXmlAsString(cleanedRoot)
	return new XmlSlurper(false, false).parseText(markup)
}

/**
 * One classified ad. The constructor derives the typed fields from the raw
 * label/value map of the ad page; toString renders the ad as one comma-separated row.
 */
class Truck {
 String title
 // -1 when the ad has no parsable price
 int price
 // -1 when the ad has no parsable mileage
 int km
 String year
 String gas
 // postal code as a number (-1 when none was found); toString restores leading zeros
 int cp
 // "?" when no city name was found
 String city
 String type
 String category
 String id
 String url
 // true when the department (first two digits of the location) is listed in depts
 boolean isGeoZoneOk
 Map<String, String> map
 //String photos

 /**
  * @param pid       ad identifier
  * @param ptitle    ad title; commas are replaced by dots, presumably so the title
  *                  cannot split the comma-separated output row
  * @param pcategory category the ad was found in
  * @param ptype     search keyword the ad was found with
  * @param paddress  URL of the ad page
  * @param pmap      label/value pairs scraped from the ad page
  */
 public Truck(pid, ptitle, pcategory, ptype, paddress, pmap) {
	id = pid
	title = ptitle.replaceAll(",", ".")
	type = ptype
	category = pcategory
	url = paddress
	map = pmap

	year = map.get("Année-modèle")
	gas = map.get("Carburant")

	// A missing price used to abort the whole run; it now falls back to -1 like km.
	def priceDigits = firstMatch(map.get("Prix"), /[0-9]+/)
	price = priceDigits == null ? -1 : Integer.parseInt(priceDigits)

	def kmDigits = firstMatch(map.get("Kilométrage"), /[0-9]+\s*[0-9]*/)
	km = kmDigits == null ? -1 : Integer.parseInt(kmDigits.replaceAll(/\s/, ""))

	// Location text: the "Ville" entry, or "Code postal" when there is no city.
	def location = map.get("Ville") ?: map.get("Code postal")

	def cpDigits = firstMatch(location, /[0-9]+/)
	cp = cpDigits == null ? -1 : Integer.parseInt(cpDigits)

	// Department = first two consecutive digits of the location.
	def dept = firstMatch(location, /[0-9][0-9]/)

	// \p{L} accepts every letter, so names with accents outside the previously
	// hard-coded set (î, ô, ç, É ...) are no longer cut at the first such letter.
	def cityName = firstMatch(location, /[\p{L}\s\-']+/)
	city = cityName == null ? "?" : cityName

	// An ad without a department can never be inside the accepted zone.
	isGeoZoneOk = dept != null && depts.contains(dept)
 }

 // Returns the first substring of text matching regex, or null when text is null or nothing matches.
 private static String firstMatch(text, String regex) {
	if (text == null) return null
	def m = (text =~ regex)
	m.find() ? m.group() : null
 }

 String toString() {
	// cp is an int, so department 09 codes lose their leading zero; pad back to 5 digits.
	def postal = cp < 0 ? "$cp" : String.format("%05d", cp)
	"$title,$price,$km,$year,$gas,$postal,$city,$type,$category,$url"
 }
}