Btibert3
6/28/2015 - 1:13 PM

Stackoverflow - Load CSV import

Stackoverflow - Load CSV import

// Create the Vendors so we can just match on them - 404 ms, 783 on 2.2
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/vendors.csv" AS row
WITH row
MERGE (v:Vendor {name:row.vendors});


// Create the abilities - 164ms, 164
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/ability.csv" AS row
WITH row
MERGE (a:Ability {name:row.ability});


// Create the topics - 154 ms, 468 after a huge gap
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/topics.csv" AS row
WITH row
MERGE (t:Topic {name:row.topics});


// Load the templates - 554 ms, 543
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/templates.csv" AS row
WITH row
MATCH (r:Vendor {name:row.vendor})
WITH row, r
MERGE (p:Template {name:row.template_clean})
MERGE (v:Version {version:row.template_ver})
MERGE (p)-[:FROM_VERSION]->(v)
MERGE (p)-[:CREATED_BY]->(r);


// Associate templates with topics - 239 ms, 233
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/template_topic.csv" AS row
WITH row
MATCH (p:Template {name:row.template_clean})
MATCH (t:Topic {name:row.topic})
WITH row, p, t
MERGE (p)-[:HAS_TOPIC]->(t)


// Create the students - 16434 ms, 35964 (77962 students)
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/recs.csv" AS row
WITH row
MERGE (s:Student {pidm:toInt(row.pidm), 
                  hash_pidm:toInt(row.hash_pidm), 
                  sid:row.id,
                  race:row.ethn_code,
                  sex:row.sex,
                  major:row.majr_code,
                  rsta:row.rsta_code,
                  sanc:row.sanc_score,
                  sahc:row.sahc_score,
                  hs_state: row.schl_state_code});


// Associate the students with vendors and abilities - 16408 ms, 80404 ms
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/recs.csv" AS row
WITH row
MATCH (s:Student {pidm: toInt(row.pidm)} )
MATCH (v:Vendor {name: row.vendor} )
MATCH (a:Ability {name: row.ability} )
WITH row, s, v, a
MERGE (s)-[:PURCHASED_FROM]->(v)
MERGE (s)-[:HAS_ABILITY]->(a);


// Create the contacts - 15782 ms
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/cont.csv" AS row
WITH row
MATCH (s:Student {pidm:toInt(row.pidm)} )
WITH row, s
MERGE (c:Contact {code:row.contcode,
                  cid:row.cid, 
                  category:row.contcat, 
                  seqnum:toInt(row.seqnum),
                  timestamp:toInt(row.timestamp)} )
MERGE (s)-[:HAS_CONTACT]->(c);


// First and last contacts - 15959 ms
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/cont.csv" AS row
WITH row
MATCH (s:Student {pidm:toInt(row.pidm)}) 
WITH row, s, toInt(row.seqnum) as seq
WHERE seq = 1
MATCH (f:Contact {cid:row.first_cont})
MATCH (l:Contact {cid:row.last_cont})
WITH s, f, l
MERGE (s)-[:FIRST]->(f)
MERGE (s)-[:LAST]->(l);


// Create the contact chain - 1351 ms
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/cont.csv" AS row
WITH row, toInt(row.seqnum) as seq
WHERE seq > 1
MATCH (c:Contact {cid:row.cid})
MATCH (p:Contact {cid:row.prev_contact})
WITH c, p
MERGE (p)-[:NEXT]->(c);


// BRM email data
// About 5 minutes for a load of all data from scratch
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/brm.csv" AS row
WITH row
MATCH (s:Student {pidm:toInt(row.pidm)}) 
WITH row, s
MERGE (e:Email {refdesc:row.refdesc,
                subject:row.subject, 
                msgid:row.msgid,
                commdate:row.commdate, 
                seqnum:toInt(row.seqnum),
                deliv: toInt(row.delivflag),
                optout: toInt(row.optoutflag),
                open_ind: toInt(row.open_ind),
                opens: toInt(row.opens),
                click_ind: toInt(row.click_ind),
                clicks: toInt(row.clicks)} )
MERGE (s)-[:WAS_SENT]->(e);


// First and last emails
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/email_first_last.csv" AS row
WITH row
MATCH (s:Student {pidm:toInt(row.pidm)}) 
MATCH (f:Email {msgid:row.first_email})
MATCH (l:Email {msgid:row.last_email})
WITH s, f, l
MERGE (s)-[:FIRST]->(f)
MERGE (s)-[:LAST]->(l);


// Create the email chain
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "file:///Users/btibert/Dropbox/Projects/bentley-search-neo4j/data/email_chain.csv" AS row
WITH row
MATCH (c:Email {cid:row.msgid})
MATCH (p:Email {cid:row.last_email})
WITH c, p
MERGE (p)-[:NEXT]->(c);
// Apply the Constraints
CREATE CONSTRAINT ON (s:Student) ASSERT s.pidm IS UNIQUE;
CREATE CONSTRAINT ON (v:Vendor) ASSERT v.name IS UNIQUE;
CREATE CONSTRAINT ON (t:Topic) ASSERT t.name IS UNIQUE;
CREATE CONSTRAINT ON (e:Email) ASSERT e.msgid IS UNIQUE;
CREATE CONSTRAINT ON (a:Ability) ASSERT a.name IS UNIQUE;
CREATE CONSTRAINT ON (c:Contact) ASSERT c.cid IS UNIQUE;
CREATE CONSTRAINT ON (p:Template) ASSERT p.name IS UNIQUE;
CREATE CONSTRAINT ON (v:Version) ASSERT v.version IS UNIQUE;