dsaiztc
2/23/2018 - 9:16 AM

Amazon Redshift Calendar Table

/**********************************************************************************************
Purpose: Uses a python UDF and a CTAS statement to build a calendar dimension table template.
Columns:
id: Unique identifier of date represented as integer (YEAR*10000 + MONTH*100 + DAY)
date: Date represented as date data type
year: Year represented as integer without 0-padding (1900-2049)
month: Month represented as integer without 0-padding (1-12)
day: Day represeted as integer without 0-padding (1-31)
quarter: Quarter represented as integer (1-4)
week: Week represented as integer (1-53)
day_name: Day represented in plain english
month_name: Month represented in plain english
holiday_flag: Boolean for US Federal Holidays
weekend_flag: Boolean for Saturday and Sunday
Notes:
* Feel free to add additional columns to meet your reporting requirements, open issues for help
* Use a custom calendar if interested in holidays rather than US Federal Holidays
* Modify date literals and LIMIT clause if interested in ranges of dates beyond 1900->2049
* Requires 54787 rows in STL_SCAN to generate calendar from 2049-12-31 to 1900-01-01
** Use a different table with that many rows in the date_gen subquery if STL_SCAN is too small
History:
2017-01-19 zach-data (chriz-bigdata) created
**********************************************************************************************/

CREATE OR REPLACE FUNCTION f_holiday(dt DATE)
RETURNS bool
STABLE
AS $$
    import pandas as pd
    from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
    holidays = calendar().holidays(start='1900-01-01', end='2049-12-31')
    return dt in holidays
$$ LANGUAGE plpythonu;

CREATE TABLE dim_calendar DISTSTYLE ALL SORTKEY (id) AS
SELECT
 (DATE_PART('y', date_gen.dt)*10000+DATE_PART('mon', date_gen.dt)*100+DATE_PART('day', date_gen.dt))::int AS "id",
 date_gen.dt AS "date",
 DATE_PART('y', date_gen.dt)::smallint AS "year",
 DATE_PART('mon', date_gen.dt)::smallint AS "month",
 DATE_PART('day', date_gen.dt)::smallint AS "day",
 DATE_PART('qtr', date_gen.dt)::smallint AS "quarter",
 DATE_PART('w', date_gen.dt)::smallint AS "week",
 CASE DATE_PART('dow', date_gen.dt)
  WHEN 0 THEN 'Sunday'
  WHEN 1 THEN 'Monday'
  WHEN 2 THEN 'Tuesday'
  WHEN 3 THEN 'Wednesday'
  WHEN 4 THEN 'Thursday'
  WHEN 5 THEN 'Friday'
  WHEN 6 THEN 'Saturday'
 END::VARCHAR(9) AS "day_name",
 CASE DATE_PART('mon', date_gen.dt)::smallint
  WHEN 1 THEN 'January'
  WHEN 2 THEN 'February'
  WHEN 3 THEN 'March'
  WHEN 4 THEN 'April'
  WHEN 5 THEN 'May'
  WHEN 6 THEN 'June'
  WHEN 7 THEN 'July'
  WHEN 8 THEN 'August'
  WHEN 9 THEN 'September'
  WHEN 10 THEN 'October'
  WHEN 11 THEN 'November'
  WHEN 12 THEN 'December'
 END::VARCHAR(9) AS "month_name",
 f_holiday(date_gen.dt)::boolean AS "holiday_flag",
 CASE  
  WHEN DATE_PART('dow', date_gen.dt)::smallint IN (0,6) THEN TRUE
  ELSE FALSE
 END::boolean AS "weekend_flag"
FROM
(SELECT 
    ('2050-01-01' - n)::date AS dt FROM (SELECT row_number() over () AS n FROM stl_scan LIMIT 54787)) date_gen;