poteznyKrolik
2/17/2017 - 5:56 PM

how to calculate the shortest path using actual air routes between all the cities in the NYT 52 Places to visit in 2017

how to calculate the shortest path using actual air routes between all the cities in the NYT 52 Places to visit in 2017

-- Airport and Flight data can be downloaded at: http://openflights.org/data.html
-- you'll need airports.dat and routes.dat
-- (I added a couple of commercial flights that I know exist, 
-- but weren't in the Open Flights data

-- OK, on to the good stuff.

-- The first thing we do is load the top 52 list, which has the place and
-- the nearest commerical airport. (This is data I just figured out by hand. 
-- Where it's very ambiguous, like "Canada" I just used the appropriate capital (i.e. Ottawa))

-- First we load the list

DROP TABLE IF EXISTS the_list;
CREATE TABLE the_list (id integer primary key, name text,city text, state_country text, iata_code text);
\copy the_list FROM STDIN csv header
id,name,City,Country,Code
1,Canada,Ottawa,Canada,YOW
2,"Atacama Desert, Chile",Atacama Desert,Chile,ANF
3,"Agra, India",Agra,India,AGR
4,"Zermatt, Switzerland",Zermatt,Switzerland,GVA
5,Botswana,Gabrone,Botswana,GBE
6,"Dubrovnik, Croatia",Dubrovnik,Croatia,DBV
7,"Grand Teton National Park, Wyoming",Jackson Hole,Wyoming,JAC
8,"Tijuana, Mexico",Tijuana,Mexico,TIJ
9,"Detroit, Michigan",Detroit,Michigan,DTW
10,"Hamburg, Germany",Hamburg,Germany,HAM
11,"Marrakesh, Morocco",Marrakesh,Morocco,RAK
12,"Greenville, South Carolina",Greenville,South Carolina,GSP
13,"Pedregal, Ecuador",Pedregal,Ecuador,UIO
14,"Penzance, England",Penzance,England,LEQ
15,"Osaka, Japan",Osaka,Japan,KIX
16,"Stockholm, Sweden",Stockholm,Sweden,ARN
17,"Sikkim, India",Sikkim,India,IXB
18,"Île de Porquerolles, France",√éle de Porquerolles,France,MRS
19,Madagascar,Antananarivo,Madagascar,TNR
20,"Sanya, China",Sanya,China,SYX
21,Cyprus,Nicosia,Cyprus,LCA
22,"Great Barrier Reef, Australia",Great Barrier Reef,Australia,HTI
23,"Minneapolis, Minnesota",Minneapolis,Minnesota,MSP
24,"Kingston, Jamaica",Kingston,Jamaica,KIN
25,"Comporta, Portugal",Comporta,Portugal,LIS
26,Kazakhstan,Astana,Kazakhstan,TSE
27,Gabon,Libreville,Gabon,LBV
28,"Athens, Greece",Athens,Greece,ATH
29,Northwest Puerto Rico,San Juan,Puerto Rico,SJU
30,"Chiang Mai, Thailand",Chiang Mai,Thailand,CNX
31,"Napa Valley, California",Napa Valley,California,SFO
32,"Puerto Escondido, Mexico",Puerto Escondido,Mexico,OAX
33,"Sedona, Arizona",Sedona,Arizona,PHX
34,Madrid,Madrid,Spain,MAD
35,"Ketchum, Idaho",Ketchum,Idaho,SUN
36,Maldives,Malé,Maldives,MLE
37,"Calabria, Italy",Calabria,Italy,SUF
38,"Antequera, Spain",Antequera,Spain,SVQ
39,"Lofoten Islands, Norway",Lofoten Islands,Norway,LKN
40,"Iberá Wetlands, Argentina",Iberá Wetlands,Argentina,PSS
41,"Istria, Croatia",Istria,Croatia,LJU
42,"Placencia, Belize",Placencia,Belize,BZE
43,"Langtang Region, Nepal",Langtang Region,Nepal,KTM
44,"Bozcaada, Turkey",Bozcaada,Turkey,IST
45,"Birmingham, Alabama",Birmingham,Alabama,BHM
46,"Sacred Valley, Peru",Sacred Valley,Peru,CUZ
47,"Laikipia, Kenya",Laikipia,Kenya,NYK
48,"Busan, South Korea",Busan,South Korea,PUS
49,"Portland, Oregon",Portland,Oregon,PDX
50,"Budapest, Hungary",Budapest,Hungary,BUD
51,"South Bronx, New York",South Bronx,New York,LGA
52,"Ryukyu Islands, Japan",Ryukyu Islands,Japan,OKA
\.


-- a little housekeeping
UPDATE the_list SET name=btrim(name);
ALTER TABLE the_list ADD COLUMN short_name text;
UPDATE the_list SET short_name=regexp_replace(name,E',.*','');

-- load the data

DROP TABLE IF EXISTS airports;
CREATE TABLE airports (
"id" integer primary key,
"name" text,
"city" text,
"country" text,
"iata_code" text,
"icao_code" text,
"lat" double precision,
"long" double precision,
"elevation_ft" numeric,
"utc_offset" text,
"dst" text,
"tz_olson" text
);

\copy airports FROM 'airports.dat' CSV
-- delete from airports where scheduled_service = 'no';
DELETE FROM airports WHERE id=7708; -- duplicate
DELETE FROM airports WHERE iata_code='BFT'; -- duplicate
-- there are no airports on null island
DELETE FROM airports WHERE lat=0 or long=0; 


-- now we want to create a geometry column, and build
-- add update it from the lat,long
ALTER TABLE airports ADD COLUMN geom Geometry(POINT, 4326);
CREATE INDEX ON airports USING gist (geom);
UPDATE airports SET geom=st_setsrid(st_makepoint(long,lat),4326);


--  Now load the routes data
DROP TABLE IF EXISTS routes;
CREATE TABLE routes (
airline text,
airline_id integer,
source text,
source_id integer,
destination text,
destination_id integer,
codeshare text,
stops text,
equipment text);
\copy routes FROM 'routes.dat' CSV

-- do some maintenance
DELETE FROM routes WHERE destination_id IS NULL OR source_id IS NULL;
ALTER TABLE routes ADD COLUMN distance numeric;

-- create the geometry we're going to use to store the line
ALTER TABLE routes ADD COLUMN geom geometry(LINESTRING, 4326);
CREATE INDEX ON routes USING gist (geom);

-- Our route data contains a source and a destination airport
-- we use our airports table to create a line connecting these points.

UPDATE routes SET geom = st_setsrid(ST_MakeLine(s.geom,d.geom) ,4326)
FROM airports s, airports d
WHERE source_id=s.id
AND destination_id=d.id;

-- and calculate the distance between the points using the 
-- WGS 84 spheroid. We'll use this number as the 
-- cost of the route later on.
UPDATE routes SET distance = ST_LengthSpheroid(geom,'SPHEROID["WGS 84",6378137,298.257223563]');

DELETE FROM routes WHERE geom IS NULL;
ALTER TABLE routes add column id serial primary key;


-- Now we set the geometry for the airports we listed in 
-- the Top 52 list. (this is for display purposes later)
ALTER TABLE the_list ADD COLUMN geom geometry(POINT,4326);
UPDATE the_list SET geom = a.geom 
    FROM airports a
    WHERE the_list.iata_code=a.iata_code;


-- this is a table we use for pgrouting. 
-- We assume that all routes are symmetrical,
-- that isn't always true, but we're not trying to 
-- be perfect. 
  
DROP TABLE IF EXISTS edge_table;

CREATE TABLE edge_table as
SELECT distinct s.id as source,
        s.iata_code as source_code,
        d.id as target,
        d.iata_code as target_code,
        r.distance as cost_len,
        r.distance as rcost_len,
        st_x(s.geom) as x1,
        st_y(s.geom) as y1,
        st_x(s.geom) as x2,
        st_y(s.geom) as y2,
        r.distance as to_cost,
        r.geom as the_geom
        FROM routes r
        JOIN airports s on r.source=s.iata_code
        JOIN airports d ON r.destination=d.iata_code;

ALTER TABLE edge_table ADD COLUMN id serial;

-- remove duplicate routes. We are assuming all routes have a return
DELETE FROM edge_table a using edge_table b 
  WHERE b.target_code=a.source_code 
  AND a.target_code=b.source_code 
  AND b.id>a.id; 

-- build the toplogies for pg_routing
SELECT pgr_createTopology(text 'edge_table', 0.001);
SELECT  pgr_analyzeGraph('edge_table',0.001);

-- next actually calculate the route using a traveling salesman
-- heuristic.
-- the result looks like this:
--  seq | node |       cost       |     agg_cost
-- -----+------+------------------+------------------
--    1 |  100 | 708235.222315939 |                0
--    2 | 3645 | 818202.575274533 | 708235.222315939
--    3 | 4034 | 462746.360353078 | 1526437.79759047
--    4 | 3811 | 1374027.79716005 | 1989184.15794355
--    5 | 3858 | 1747832.73113441 |  3363211.9551036
--    6 | 4027 | 686808.077723648 | 5111044.68623801
-- (...)
--   17 | 2812 | 1335899.80733192 |  21131434.884404
--   18 | 2651 | 3081570.78208971 | 22467334.6917359
-- (...)
-- 
--
-- it's only giving us the total cost (i.e. distance) between our
-- top 52 airports, one after the other.
-- so node=100 is Ottawa (YOW), node=3645 is Detroit (DTW), etc.
-- this is our path, in order that we want to travel.
-- however, it didn't tell us the intermediate steps, which is what
-- we're actually interested in.

DROP TABLE IF EXISTS the_route_tsp;
CREATE TABLE the_route_tsp as 
    SELECT * FROM pgr_TSP(
       $$
       SELECT * FROM pgr_dijkstraCostMatrix(
            'SELECT id, source, target, cost_len::double precision as cost, rcost_len::double precision as reverse_cost, x1, y1, x2, y2 FROM edge_table',
        (SELECT array_agg(id)  FROM airports WHERE iata_code IN (SELECT iata_code FROM the_list)),
        directed := false
    )
$$);


-- since we want to build the actual route, including all of the
-- intermediate airports, we do a lateral join, where we calculate
-- the intermediate route for each of our segments using a window function
-- and lead()
-- it returns something like this (leaving off the geometry column)
-- tsp_segment | seq | path_seq | node | edge  |       cost       |     agg_cost     | iata_code | 
-- ------------+-----+----------+------+-------+------------------+------------------+-----------+
--           1 |   1 |        1 |  100 | 27480 | 708235.222315939 |                0 | YOW       |
--           1 |   2 |        2 | 3645 |    -1 |                0 | 708235.222315939 | DTW       |
--           2 |   1 |        1 | 3645 | 32429 | 818202.575274533 |                0 | DTW       |
--           2 |   2 |        2 | 4034 |    -1 |                0 | 818202.575274533 | GSP       |
-- (...)
--          17 |   1 |        1 | 2812 | 19137 | 520410.351913189 |                0 | CUZ       |
--          17 |   2 |        2 | 2762 | 18996 | 492196.939512544 | 520410.351913189 | LPB       |
--          17 |   3 |        3 | 2649 | 18708 | 323292.515906183 | 1012607.29142573 | IQQ       |
--          17 |   4 |        4 | 2651 |    -1 |                0 | 1335899.80733192 | ANF       |
--          18 |   1 |        1 | 2651 | 18709 | 1103072.69271044 |                0 | ANF       |
--          18 |   2 |        2 | 2650 | 18675 | 1149696.52733362 | 1103072.69271044 | SCL       |
--          18 |   3 |        3 | 2442 | 17865 | 828801.562045654 | 2252769.22004406 | AEP       |
--          18 |   4 |        4 | 2474 |    -1 |                0 | 3081570.78208971 | PSS       |
--          19 |   1 |        1 | 2474 | 17865 | 828801.562045654 |                0 | PSS       |
-- (...)

-- what this tells now is that for each hop between our sites, identified 
-- by tsp_segment, there are a number of paths that go between nodes
-- the end of the segment is identified by -1 on the edge.
-- so, for example, the segment YOW -> DTW has direct flight as we can see
-- from node 100 to node 3645. 
-- however if we look at 17 to 18 (CUZ->ANF), we see that we must
-- make two intermediate stops. CUZ->LPB->IOQ->ANF is the actual route.

DROP TABLE IF EXISTS the_route;
CREATE TABLE the_route AS    
SELECT q.row_number AS tsp_segment,star.*,a.iata_code,a.geom FROM 
  (SELECT row_number() OVER(order by seq),node,lead(node,1)  OVER(order by seq) from the_route_tsp) q
  LEFT JOIN LATERAL 
    (SELECT * FROM pgr_dijkstra(
      'SELECT id, source, target, cost_len::double precision as cost,cost_len::double precision as reverse_cost FROM edge_table',
      q.node,
     q.lead
     ) 
    ) AS star
  ON true
  JOIN airports a on a.id=star.node;

-- And that's it, we have our route!

-- However, I want to display this route, and if I do it in 
-- QGIS, all I see is straight lines, not something that
-- might be kind of like what I would actually travel on the globe.
-- The way to fix this is to convert our line to a different projection and
-- break our two-point line into a much greater number
-- of points. With the right choice of projection, that will show
-- a curved line, when returned to the original projection.
-- Ideally, I'd use a gnomonic projection, because all of my 
-- lines would end up as true great circle lines, but for reasons
-- I won't go into here, that's challenging with this particular dataset.
-- instead, let's use an equidistant conic.
-- This isn't defined in the default list of SRIDs, so let's add one ourself

DELETE FROM spatial_ref_sys where srid=954027;
INSERT into spatial_ref_sys (srid, auth_name, auth_srid, proj4text, srtext) values ( 954027, 'esri', 54027, '+proj=eqdc +lat_0=0 +lon_0=0 +lat_1=60 +lat_2=60 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs ', 'PROJCS["World_Equidistant_Conic",GEOGCS["GCS_WGS_1984",DATUM["WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Equidistant_Conic"],PARAMETER["False_Easting",0],PARAMETER["False_Northing",0],PARAMETER["Central_Meridian",0],PARAMETER["Standard_Parallel_1",60],PARAMETER["Standard_Parallel_2",60],PARAMETER["Latitude_Of_Origin",0],UNIT["Meter",1],AUTHORITY["EPSG","54027"]]');

-- Now, what we do here is we smash all of the geometries in our
-- route into a single line, then break insert points such that
-- there is one point at least every 5000 meters.

DROP TABLE if exists the_route_line ;
CREATE TABLE the_route_line AS
      SELECT 1 AS id -- we will only end up with one row, and QGIS expects and integer PK
      ,st_transform -- converts back to 4326
          (st_segmentize( -- insert the extra points
              st_transform( -- convert into our conic projection
                st_setsrid( --  force our new line to 4326
                  st_makeline(geom ORDER BY tsp_segment,r.path_seq)-- build a single line
                ,4326) 
              ,954027)
          , 5000)
      ,4326) AS the_geom
      FROM
      the_route r;
      
-- the geometry shows up without an SRID in the table, but QGIS expects it to have one
ALTER TABLE the_route_line ALTER COLUMN the_geom type geometry(LINESTRING,4326);    


-- this is for aestetic purposes later on QGIS
-- it's just a set of all the points that are used in our route 
DROP TABLE IF EXISTS the_route_points;
CREATE TABLE the_route_points AS SELECT DISTINCT iata_code,geom from the_route;

-- Et Voila! You're done!
-- Load it into QGIS and make it pretty