mrbungie
10/20/2016 - 5:07 PM

discretizer_sql_sentence.py

discretizer_sql_sentence.py

def create_discretizer_sql(discr_df, table_name, other_cols=[]):
    from bciutils.beta_v2.transform.sampling import na_filler
    sql_string = "SELECT {other_columns}".format(other_columns=", ".join(other_cols))
    if len(other_cols) > 0:
        sql_string += ',\n'
    columns = list(discr_df["feature_name"].unique())
    for column in columns:
        sql_string += "CASE "
        col_df = discr_df[discr_df["feature_name"] == column]
        categories = list(col_df["category"].fillna('NA').unique())
        last_group = 'NULL'
        if categories != ['NA']:
            for group in col_df.to_dict(orient="records"):
                sql_string += "WHEN {column} = '{category}' THEN {group_num} ".format(column=column, category=group["category"], group_num=group["grupo"]) 
        else:
            alone_numbers_groups = col_df[col_df["max"] == col_df["min"]].to_dict("records")
            for group in alone_numbers_groups:
                if group["min"] == na_filler:
                    sql_string += "WHEN {column} IS NULL THEN {group_num} ".format(column=column, group_num=group["grupo"]) 
                else:
                    sql_string += "WHEN {column} = {number} THEN {group_num} ".format(column=column, number=group["max"], group_num=group["grupo"]) 
            other_groups = col_df[col_df["max"] != col_df["min"]].sort_values("grupo").to_dict("records")
            for group in other_groups:
                if group["min"] == na_filler:
                    sql_string += "WHEN {column} IS NULL OR {column} <= {max} THEN {group_num} ".format(column=column, max=group["max"], min=group["min"], group_num=group["grupo"])
                elif len(other_groups) == 1 or group["grupo"] != col_df[col_df["max"] != col_df["min"]]["grupo"].max():
                    sql_string += "WHEN {column} <= {max} THEN {group_num} ".format(column=column, max=group["max"], group_num=group["grupo"])
                else:
                    last_group = group["grupo"]
        sql_string += "ELSE {group_num} END AS {final_column_name},\n".format(group_num=last_group, final_column_name=column)
    return sql_string[:-2] + "\nFROM {table_name};".format(table_name=table_name)