Vivian
6/17/2019 - 7:19 AM

product unique purchase - replacing stitch

#standardSQL
-- 1) get data and convert action_type into productdetailviews and uniquepurchase (see hits.eCommerceAction.action_type in bigquery export schema)
-- But there are a lot sessions without hostname. so, we need to create a table with sesssion_id and hostname to fillin missing hostnames.
With non_agg as (
Select date,hits.page.hostname as hostname, 
hits_product.v2ProductCategory as productcategoryhierarchy,
hits_product.productSKU as productsku,
hits.eCommerceAction.action_type,
CASE
  WHEN hits.eCommerceAction.action_type='2' THEN 1
  ELSE 0
END AS product_detail_view,
CASE
  WHEN hits.eCommerceAction.action_type='6' THEN 1
  ELSE 0
END AS completed_purchase,
CONCAT(fullVisitorId,"-",CAST(visitStartTime as STRING)) as session_id
FROM `helloprintbq.134832924.ga_sessions_*` , UNNEST(hits) as hits, UNNEST(hits.product) as hits_product
WHERE _TABLE_SUFFIX BETWEEN '20190418' AND '20190418'
AND (hits.eCommerceAction.action_type = '2' OR hits.eCommerceAction.action_type = '6')
),

-- 2) get all sessions with at leas one hostname
session as (
SELECT 
  CONCAT(fullVisitorId,"-",CAST(visitStartTime as STRING)) as session_id,
  hits.page.hostname as hostname,
  MIN(FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_SECONDS(SAFE_CAST(visitStartTime+hits.time/1000 AS INT64)), "Europe/Amsterdam")) AS min_hit_timestamp
 FROM `helloprintbq.134832924.ga_sessions_*` , UNNEST(hits) as hits
 WHERE _TABLE_SUFFIX BETWEEN '20190418' AND '20190418'
 --AND CONCAT(fullVisitorId,"-",CAST(visitStartTime as STRING)) in ('1005507903462331541-1555584149','1014085683294912125-1555617248')
 AND hits.page.hostname is NOT NULL
 GROUP BY 1,2
 ORDER BY 1),

-- 3) give hostname a sequence number for each session
session_with_num as (
SELECT *,
  ROW_NUMBER() 
    OVER (PARTITION BY session_id ORDER BY hostname) 
    row_num
  FROM session),

-- 4) Removing duplicates from sessions with multiple hostnames. Here we kept the first hostname (alphebetic) for one session. Then we got a table with session_id and hostname for further joining.
unique_session as (SELECT session_id as session_id_b, hostname as hostname_b FROM session_with_num WHERE 
row_num = 1),

-- 5) Join table 1) with table 4) to get hostname. And fill in hostname if it is missing in table 1).
combine as (
SELECT * ,
CASE
  WHEN hostname is NULL THEN hostname_b
  ELSE hostname
END AS hostname_c
FROM non_agg LEFT JOIN unique_session
ON non_agg.session_id = unique_session.session_id_b
),

-- 6) Aggregating product detail views and unique purchases on SKU level
agg as (SELECT
date, 
CASE
  WHEN hostname_c is NULL THEN '(not set)'
  ELSE hostname_c
END as hostname, 
productcategoryhierarchy, productsku,
SUM(product_detail_view) as productdetailviews, SUM(completed_purchase) as uniquepurchase
FROM combine
GROUP BY 1,2,3,4)

SELECT * FROM agg
--WHERE productsku = 'rollupbannersbudget'
ORDER BY 1,2,3,4