moralmar
1/2/2018 - 3:21 PM

Cleaning with Pandas 1

  • cleaning with dictionary

source: Applied Data Science with Python - 01 Introduction

import pandas as pd
import numpy as np

def answer_one():

    energy = pd.read_excel('Energy Indicators.xls')
    energy = energy[16:243]
    energy = energy.drop(energy.columns[[0, 1]], axis=1)
    energy.rename(columns={'Environmental Indicators: Energy': 'Country','Unnamed: 3':'Energy Supply','Unnamed: 4':'Energy Supply per Capita','Unnamed: 5':'% Renewable'}, inplace=True)
    energy.replace('...', np.nan,inplace = True)
    energy['Energy Supply'] *= 1000000
    energy['Country'] = energy['Country'].str.extract('(^[a-zA-Z\s]+)', expand=False).str.strip()  
    di = {"Republic of Korea": "South Korea",
          'United States of America': 'United States',
          "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
          "China, Hong Kong Special Administrative Region": "Hong Kong"}
    energy.replace({"Country": di},inplace = True)
    # reset index (index came from Excel file)
    energy.reset_index() # somehow, it doesn't do anything
    # energy[40:50]
    
    GDP = pd.read_csv('world_bank.csv', skiprows=4)
    GDP.rename(columns={'Country Name': 'Country'}, inplace=True)
    GDP=GDP[['Country','2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']]
    di = {"Korea, Rep.": "South Korea", 
          "Iran, Islamic Rep.": "Iran",
          "Hong Kong SAR, China": "Hong Kong"}
    GDP.replace({"Country": di},inplace = True)
    #GDP.head()
    
    
    
    ScimEn = pd.read_excel('scimagojr-3.xlsx')
    # print(ScimEn)
    df = pd.merge(pd.merge(energy, GDP, on='Country'), ScimEn, on='Country')
    # We only need 2006-2015 data
    df.set_index('Country', inplace=True)
    df = df[['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations', 'Citations per document', 'H index', 'Energy Supply', 'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']]
    df = (df.loc[df['Rank'].isin([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15])])
    df.sort('Rank', inplace=True)
    # df.head()
    #I have to improvise here:
    df.drop_duplicates(['Rank'], keep='first', inplace=True)


    
    return df