EdvardM
10/3/2011 - 9:36 AM

My take on Haskell: Functional Programming, Solid Code, Big Data by Bryan O'Sullivan @ Strange Loop 2011

My take on Haskell: Functional Programming, Solid Code, Big Data by Bryan O'Sullivan @ Strange Loop 2011

import Network.HTTP.Enumerator
import Data.ByteString.Lazy.UTF8 as BS
import Text.HTML.TagSoup
import Control.Monad
import Data.Maybe
import Network.URI

download :: String -> IO String
download uri = liftM BS.toString $ simpleHttp uri

savePage :: String -> Int -> IO ()
savePage html i = writeFile (mkFileName i) html

mkFileName :: Int -> FilePath
mkFileName k = "page-" ++ show k ++ ".html"

isOpenTag :: Tag t -> Bool
isOpenTag (TagOpen _ _) = True
isOpenTag _             = False

canonLnk :: String -> String -> Maybe String
canonLnk referer path = do
  r <- parseURI referer
  p <- parseURIReference path
  u <- nonStrictRelativeTo p r
  return (uriToString id u "")

scrapePage :: String -> IO [String]
scrapePage url = liftM (getLinks url) $ download url

getLinks :: String -> String -> [String]
getLinks url =
  catMaybes .
  map (canonLnk url) .
  filter (not . null) .
  hrefs .
  filter (not . noFollow) .
  limitToTags "a" .
  canonicalizeTags . parseTags

hrefs :: [Tag String] -> [String]
hrefs = map $ fromAttrib "href"

noFollow :: Tag String -> Bool
noFollow t = fromAttrib "rel" t == "nofollow"

limitToTags :: String -> [Tag String] -> [Tag String]
limitToTags t = filter $ isTagOpenName t

main :: IO ()
main = do
  parsed <- scrapePage "http://hs.fi/"
  savePage (unwords parsed) 1