require 'nokogiri'
require 'open-uri'

# Fetch the search results page and parse it as HTML.
# URI.open is used instead of a bare `open`: open-uri stopped patching
# Kernel#open for URLs in Ruby 3.0, so `open('http://...')` no longer
# performs an HTTP request there.
doc = Nokogiri::HTML(URI.open('http://www.google.com/search?q=tenderlove'))

# Search for nodes with a CSS selector and print each link's text.
doc.css('h3.r a.l').each do |link|
  puts link.content
end

# at_css returns only the first matching node; read its text.
doc.at_css('h3').content

# The same search as the CSS one above, expressed as XPath.
doc.xpath('//h3/a[@class="l"]').each do |link|
  puts link.content
end

# search accepts CSS selectors and XPath expressions together.
doc.search('h3.r a.l', '//h3/a[@class="l"]').each do |link|
  puts link.content
end
# Parse a small XML document, then read an element's text and one of its
# attributes. The markup is what the results shown below require: a <foo>
# element with text "bar" and an attribute wam="bam".
xml = "<foo wam='bam'>bar</foo>"
doc = Nokogiri::XML(xml)
doc.at_css("foo").content # => "bar"
# Node#[] already returns the attribute value as a String, so no .content here.
doc.at_css("foo")["wam"] # => "bam"

el = doc.at_css("foo")
el.children # => array-like NodeSet of the element's child nodes
For example, if we wanted to know the names of all the food items in our document, we would simply say:
doc.xpath("//name").collect(&:text) # => ["carrot", "tomato", "corn", "grapes", "orange", "pear", "apple"]
If we were interested in the entire node rather than just its text, we could leave off the .collect(&:text). What if we wanted to select the names of all food items that are best baked? This requires what’s called an axis: we first find the tag element whose text is “baked”, and then walk back up through its ancestor elements to find the food item that contains it.
doc.xpath("//tag[text()='baked']/ancestor::node()/name").collect(&:text) # => ["pear", "apple"]
What if we were only interested in vegetables that were good for roasting? Just add //veggies:
doc.xpath("//veggies//tag[text()='roasted']/ancestor::node()/name").collect(&:text) # => ["carrot", "tomato"]
What if we wanted to know all the tags that ‘corn’ has? Again, this is very easy:
doc.xpath("//name[text()='corn']/../tags/tag").collect(&:text) # => ["raw", "boiled", "grilled"]
We can even search by the leading characters of the text. Let’s say we wanted to know all the food items whose names start with the letter ‘c’:
doc.xpath("//name[starts-with(text(),'c')]").collect(&:text) # => ["carrot", "corn"]
You could also use [contains(text(),'rot')] and get back just carrot, which is useful when you want to do a partial match.
# Quick reference for common node calls. `node`, `other` and `data` are
# placeholders, so this listing is illustrative rather than a runnable script.

# Navigating and searching
node.ancestors           # Ancestors of node
node.at('xpath')         # Returns node at given XPATH
node.at_css('selector')  # Returns node at given CSS selector

node.xpath('xpath')      # Returns nodes at given XPATH
node.css('selector')     # Returns nodes at given selector

node.child               # Returns the child node
node.children            # Returns child nodes
node.parent

# Inspecting
node.name                # Element name
node.node_type

node.content             # Returns text as string
                         # (aka: .inner_text, .text)
node.content = '...'

node.inner_html
node.inner_html = '...'

node.attribute_nodes     # Returns attributes as nodes
node.attributes          # Returns attributes as hash

# Modifying the tree
node.add_next_sibling(other)     # Place after
node.add_previous_sibling(other) # Place before
node.add_child(other)            # Put inside

node.after(data)                 # Put a new node after
node.before(data)                # Put a new node before

node.parent = other              # Reparents inside