#standardSQL
-- 1) get data and convert action_type into productdetailviews and uniquepurchase (see hits.eCommerceAction.action_type in bigquery export schema)
-- But there are a lot sessions without hostname. so, we need to create a table with sesssion_id and hostname to fillin missing hostnames.
With non_agg as (
Select date,hits.page.hostname as hostname,
hits_product.v2ProductCategory as productcategoryhierarchy,
hits_product.productSKU as productsku,
hits.eCommerceAction.action_type,
CASE
WHEN hits.eCommerceAction.action_type='2' THEN 1
ELSE 0
END AS product_detail_view,
CASE
WHEN hits.eCommerceAction.action_type='6' THEN 1
ELSE 0
END AS completed_purchase,
CONCAT(fullVisitorId,"-",CAST(visitStartTime as STRING)) as session_id
FROM `helloprintbq.134832924.ga_sessions_*` , UNNEST(hits) as hits, UNNEST(hits.product) as hits_product
WHERE _TABLE_SUFFIX BETWEEN '20190418' AND '20190418'
AND (hits.eCommerceAction.action_type = '2' OR hits.eCommerceAction.action_type = '6')
),
-- 2) get all sessions with at leas one hostname
session as (
SELECT
CONCAT(fullVisitorId,"-",CAST(visitStartTime as STRING)) as session_id,
hits.page.hostname as hostname,
MIN(FORMAT_TIMESTAMP("%Y-%m-%d %H:%M:%S", TIMESTAMP_SECONDS(SAFE_CAST(visitStartTime+hits.time/1000 AS INT64)), "Europe/Amsterdam")) AS min_hit_timestamp
FROM `helloprintbq.134832924.ga_sessions_*` , UNNEST(hits) as hits
WHERE _TABLE_SUFFIX BETWEEN '20190418' AND '20190418'
--AND CONCAT(fullVisitorId,"-",CAST(visitStartTime as STRING)) in ('1005507903462331541-1555584149','1014085683294912125-1555617248')
AND hits.page.hostname is NOT NULL
GROUP BY 1,2
ORDER BY 1),
-- 3) give hostname a sequence number for each session
session_with_num as (
SELECT *,
ROW_NUMBER()
OVER (PARTITION BY session_id ORDER BY hostname)
row_num
FROM session),
-- 4) Removing duplicates from sessions with multiple hostnames. Here we kept the first hostname (alphebetic) for one session. Then we got a table with session_id and hostname for further joining.
unique_session as (SELECT session_id as session_id_b, hostname as hostname_b FROM session_with_num WHERE
row_num = 1),
-- 5) Join table 1) with table 4) to get hostname. And fill in hostname if it is missing in table 1).
combine as (
SELECT * ,
CASE
WHEN hostname is NULL THEN hostname_b
ELSE hostname
END AS hostname_c
FROM non_agg LEFT JOIN unique_session
ON non_agg.session_id = unique_session.session_id_b
),
-- 6) Aggregating product detail views and unique purchases on SKU level
agg as (SELECT
date,
CASE
WHEN hostname_c is NULL THEN '(not set)'
ELSE hostname_c
END as hostname,
productcategoryhierarchy, productsku,
SUM(product_detail_view) as productdetailviews, SUM(completed_purchase) as uniquepurchase
FROM combine
GROUP BY 1,2,3,4)
SELECT * FROM agg
--WHERE productsku = 'rollupbannersbudget'
ORDER BY 1,2,3,4