jwood803
11/17/2015 - 11:54 PM

Analysing box office success of James Bond films using the HTML type provider

Analysing box office success of James Bond films using the HTML type provider

#load "packages/FsLab/FsLab.fsx"

open FSharp.Data
open XPlot.GoogleCharts

/// Wikipedia page listing the James Bond films. The `oldid` query parameter
/// selects one specific revision of the page.
/// Declared as a literal so the same string can serve both as the type
/// provider's static sample and as the runtime load address, instead of
/// being written out twice.
[<Literal>]
let bondUrl = "https://en.wikipedia.org/w/index.php?title=List_of_James_Bond_films&oldid=688916363"

/// Typed view over the HTML tables found on the page at `bondUrl`.
type BondProvider = HtmlProvider<bondUrl>

/// The page loaded at runtime from `bondUrl`.
let bondWiki = BondProvider.Load(bondUrl)

/// One tuple per film taken from the "Box office" table:
/// (title, year as int, budget as float, box office as float, actor).
let boxOffice =
    let rawRows =
        bondWiki.Tables.``Box office``.Rows
        |> Seq.map (fun row ->
            row.Title, row.Year, row.Budget2, row.``Box office 2``, row.``Bond actor``)
        |> Array.ofSeq
    // Keep indices 1 .. Length-3, i.e. drop the first row and the last two
    // rows (presumably header and summary rows - verify against the table).
    let lastKept = rawRows.Length - 3
    rawRows.[1 .. lastKept]
    |> Array.map (fun (rawTitle, year, budget, gross, rawActor) ->
        // Keep only the text that follows the midpoint of the actor cell
        // (presumably the cell repeats the name as a sort key - verify).
        let actorName = rawActor.[rawActor.Length / 2 + 1 ..]
        // If the title contains '!', keep only what follows the first one.
        let filmTitle =
            match rawTitle.IndexOf('!') with
            | -1 -> rawTitle
            | bang -> rawTitle.[bang + 1 ..]
        filmTitle, int year, float budget, float gross, actorName)

/// (film, Rotten Tomatoes score as float) pairs taken from the
/// "Reception and accolades" table, with the table's last row dropped.
let rating =
    let scored =
        bondWiki.Tables.``Reception and accolades``.Rows
        |> Seq.map (fun row -> row.Film, row.``Rotten Tomatoes``)
        |> Array.ofSeq
    let lastKept = scored.Length - 2
    scored.[0 .. lastKept]
    |> Array.map (fun (film, score) ->
        // The numeric part is everything before the first '%' in the cell.
        let percentAt = score.IndexOf('%')
        film, float (score.[0 .. percentAt - 1]))

/// Display options for the bubble chart: chart and axis titles, hidden
/// bubble text (transparent colour), and the colour list passed to the chart.
let options =
    Options(
        // Fixed typo in the user-facing title ("fims" -> "films").
        title = "Bond films - rating and box office",
        hAxis = Axis(title = "Year"),
        vAxis = Axis(title = "Box office (millions $)"),
        bubble = Bubble(textStyle=TextStyle(color="transparent")),
        colors = [| "red"; "gold" |]
    )

// Pair each box-office row with the rating at the same position and plot
// the result as a bubble chart. Rows are matched by index, not by title,
// so both arrays must have the same length.
Array.zip boxOffice rating
|> Array.map (fun ((title, yr, bdgt, bo, actor), (_, rt)) ->
    title + " (" + actor + ")", yr, bo, rt, bdgt)
|> Chart.Bubble
|> Chart.WithLabels(["Title"; "Year"; "Box office"; "Rating"; "Budget"])
|> Chart.WithOptions(options)

// Use RProvider to replicate the plot from http://opiateforthemass.es/articles/james-bond-film-ratings/
open RProvider
open RProvider.ggplot2

/// Infix shorthand for calling R's `+` on two R expressions; used below to
/// add ggplot2 layers to a plot.
let (++) (lhs: RDotNet.SymbolicExpression) (rhs: RDotNet.SymbolicExpression) =
    R.``+``(lhs, rhs)

/// R data frame with one row per film. Columns: Title, Actor (as an R
/// factor), Year, Budget, BoxOffice (all from `boxOffice`) and Rating
/// (second component of each `rating` pair).
let df =
    let titles = boxOffice |> Array.map (fun (t, _, _, _, _) -> t)
    let actors = boxOffice |> Array.map (fun (_, _, _, _, a) -> a) |> R.as_factor
    let years = boxOffice |> Array.map (fun (_, y, _, _, _) -> y)
    let budgets = boxOffice |> Array.map (fun (_, _, b, _, _) -> b)
    let grosses = boxOffice |> Array.map (fun (_, _, _, g, _) -> g)
    let ratings = rating |> Array.map snd
    namedParams [
        "Title", box titles
        "Actor", box actors
        "Year", box years
        "Budget", box budgets
        "BoxOffice", box grosses
        "Rating", box ratings
    ]
    |> R.data_frame

/// R data frame with one row per actor: Actor, YearMin and YearMax, where
/// the years are the earliest and latest film years found for that actor
/// in `boxOffice`.
let dfActors =
    let tenures =
        boxOffice
        |> Seq.groupBy (fun (_, _, _, _, actor) -> actor)
        |> Seq.map (fun (actor, films) ->
            let years = films |> Seq.map (fun (_, yr, _, _, _) -> yr)
            let first = Seq.min years
            let last = Seq.max years
            // An actor whose films all share one year gets YearMax bumped by
            // one (presumably so the background rectangle has a width).
            actor, first, (if first = last then last + 1 else last))
        |> Array.ofSeq
    let actorNames = tenures |> Array.map (fun (a, _, _) -> a)
    let firstYears = tenures |> Array.map (fun (_, y, _) -> y)
    let lastYears = tenures |> Array.map (fun (_, _, y) -> y)
    namedParams [
        "Actor", box actorNames
        "YearMin", box firstYears
        "YearMax", box lastYears ]
    |> R.data_frame

// Build the plot by starting from an empty ggplot object and adding each
// layer in turn with (++), in the order listed.
List.fold (++) (R.ggplot()) [
    // background rectangles based on actors
    R.geom__rect(
        namedParams [
            "data", box dfActors
            "mapping", box (
                R.aes__string(
                    namedParams [
                        "xmin", box "YearMin"
                        "xmax", box "YearMax"
                        "ymin", box "-Inf"
                        "ymax", box "Inf"
                        "fill", box "Actor" ]))
            "alpha", box 0.3 ])
    // write actor names on rectangles, placed at the largest box office value
    R.geom__text(
        namedParams [
            "data", box dfActors
            "mapping", box (
                R.aes__string(
                    namedParams [
                        "x", box "YearMin"
                        "y", box (Array.map (fun (_,_,_,b,_) -> b) boxOffice |> Array.max)
                        "label", box "Actor"
                        "angle", box 90
                        "hjust", box 1
                        "vjust", box 1 ]))
            "alpha", box 0.6
            "size", box 5 ])
    // film names
    R.geom__text(
        namedParams [
            "data", box df
            "mapping", box (
                R.aes__string(
                    namedParams [
                        "x", box "Year"
                        "y", box 0
                        "label", box "Title"
                        "angle", box 90
                        "hjust", box 0
                        "vjust", box 0.5 ]))
            "size", box 4 ])
    // film data
    R.geom__point(
        data=df,
        mapping = R.aes__string(
            namedParams["x", "Year"; "y", "BoxOffice"; "size", "Budget"; "colour", "Rating"]))
    // Rotten tomatoes rating gradient
    R.scale__colour__continuous(
        namedParams["low", "red"; "high", "green"; "name", "Rotten Tomatoes rating"])
    // Increase minimum point size for readability
    R.scale__size__continuous(
        namedParams["name", box "Budget (2005 mil. dollars)"; "range", box [3; 10]])
    R.theme__bw()
    R.theme(namedParams["plot.title", R.element__text(lineheight=0.8, face="bold")])
    R.guides(namedParams["fill", false])
    R.labs(
        namedParams["title", "Box office results, budgets, and ratings of James Bond films\n"
                    "x", ""; "y", "Box office earnings (in 2005 mil. dollars)"])
]