Svtter
10/22/2017 - 1:04 PM

Learning pandas with IPython %logstart

Learning pandas with IPython %logstart

# IPython log file

# Session housekeeping, replayed in the order it was typed: list files, move up
# one directory, create and enter `data-science`, start logging, pause it,
# remove the first log file, restart logging with `-o`, then stop logging.
# Each entry is (magic name, argument string) and is dispatched through the
# running IPython shell, so this only works inside an IPython session.
for _magic, _arg in (
    ('ls', ''),
    ('cd', '..'),
    ('ls', ''),
    ('mkdir', 'data-science'),
    ('cd', 'data-science/'),
    ('ls', ''),
    ('logstart', ''),
    ('logoff', ''),
    ('ls', ''),
    ('rm', 'ipython_log.py'),
    ('logstart', '-o'),
    ('ls', ''),
    ('logstop', ''),
    ('ls', ''),
):
    get_ipython().run_line_magic(_magic, _arg)
import pandas as pd
import numpy as np
# A Series built from a plain list; the logged output below shows the NaN
# entry and that the values are stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 44, 1])
s
#[Out]# 0     1.0
#[Out]# 1     3.0
#[Out]# 2     5.0
#[Out]# 3     NaN
#[Out]# 4    44.0
#[Out]# 5     1.0
#[Out]# dtype: float64
# The missing-value marker used above, inspected on its own.
np.nan
#[Out]# nan
# Row labels: six consecutive calendar days starting 2016-01-01.
dates = pd.date_range('20160101', periods=6)
dates
#[Out]# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
#[Out]#                '2016-01-05', '2016-01-06'],
#[Out]#               dtype='datetime64[ns]', freq='D')
# A 6x4 frame of random samples, rows labelled by `dates`, columns 'a'..'d'.
# The values are random, so a re-run will not reproduce the logged numbers.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
df
#[Out]#                    a         b         c         d
#[Out]# 2016-01-01 -0.145907 -1.792104  0.069336  0.337732
#[Out]# 2016-01-02 -0.369874  0.752100 -0.885715 -1.214447
#[Out]# 2016-01-03  1.199736  0.896241 -0.391990 -0.037802
#[Out]# 2016-01-04 -0.559594 -0.978500 -1.249755 -0.597672
#[Out]# 2016-01-05 -0.791632 -0.764229 -0.062393 -0.195366
#[Out]# 2016-01-06 -0.499364  0.987771  0.599102  1.004094
# The generator function itself, then a fresh 6x4 draw (independent of `df`).
np.random.randn
#[Out]# <function RandomState.randn>
np.random.randn(6, 4)
#[Out]# array([[-0.78583823, -1.24042436, -0.92060624,  0.01999883],
#[Out]#        [ 0.63252708,  0.44840846, -0.21081448, -1.05017575],
#[Out]#        [ 0.32929554,  0.96843722, -0.56634213, -1.08908219],
#[Out]#        [-1.37842549,  1.05831645,  0.31995612,  0.40955217],
#[Out]#        [-0.70499323,  0.47768134,  0.38585561,  0.39090227],
#[Out]#        [-0.12303527,  0.50538366, -1.53361238,  2.37128177]])
# Selecting one column by label gives a Series that keeps the date index.
df['b']
#[Out]# 2016-01-01   -1.792104
#[Out]# 2016-01-02    0.752100
#[Out]# 2016-01-03    0.896241
#[Out]# 2016-01-04   -0.978500
#[Out]# 2016-01-05   -0.764229
#[Out]# 2016-01-06    0.987771
#[Out]# Freq: D, Name: b, dtype: float64
# np.arange(12) yields the integers 0..11 and reshape(3, 4) lays them out as
# three rows of four. No labels are given, so the logged frame shows rows and
# columns numbered from 0.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
df1
#[Out]#    0  1   2   3
#[Out]# 0  0  1   2   3
#[Out]# 1  4  5   6   7
#[Out]# 2  8  9  10  11
# The two building blocks on their own: the flat range, then the 3x4 array.
np.arange(12)
#[Out]# array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])
np.arange(12).reshape(3, 4)
#[Out]# array([[ 0,  1,  2,  3],
#[Out]#        [ 4,  5,  6,  7],
#[Out]#        [ 8,  9, 10, 11]])
df1
#[Out]#    0  1   2   3
#[Out]# 0  0  1   2   3
#[Out]# 1  4  5   6   7
#[Out]# 2  8  9  10  11
# Build a frame from a dict of columns. The scalar entries ('A', 'B', 'F')
# appear on every row in the logged output; the length-4 Series, array and
# Categorical give the frame its four rows.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(['test', 'train'] * 2),
        'F': 'foo',
    }
)

df2
#[Out]#      A          B    C  D      E    F
#[Out]# 0  1.0 2013-01-02  1.0  3   test  foo
#[Out]# 1  1.0 2013-01-02  1.0  3  train  foo
#[Out]# 2  1.0 2013-01-02  1.0  3   test  foo
#[Out]# 3  1.0 2013-01-02  1.0  3  train  foo
# Each column keeps its own dtype.
df2.dtypes
#[Out]# A           float64
#[Out]# B    datetime64[ns]
#[Out]# C           float32
#[Out]# D             int32
#[Out]# E          category
#[Out]# F            object
#[Out]# dtype: object
# Row labels are printed rather than displayed, so no #[Out]# was logged.
print(df2.index)
# Column labels.
df2.columns
#[Out]# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
# The underlying data as one array; the logged result has dtype=object.
df2.values
#[Out]# array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
#[Out]#        [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
#[Out]#        [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
#[Out]#        [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']], dtype=object)
# Summary statistics; the logged output covers only columns A, C and D.
df2.describe()
#[Out]#          A    C    D
#[Out]# count  4.0  4.0  4.0
#[Out]# mean   1.0  1.0  3.0
#[Out]# std    0.0  0.0  0.0
#[Out]# min    1.0  1.0  3.0
#[Out]# 25%    1.0  1.0  3.0
#[Out]# 50%    1.0  1.0  3.0
#[Out]# 75%    1.0  1.0  3.0
#[Out]# max    1.0  1.0  3.0
# Transpose: the columns become rows and the row labels become columns.
df2.T
#[Out]#                      0                    1                    2  \
#[Out]# A                    1                    1                    1   
#[Out]# B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00   
#[Out]# C                    1                    1                    1   
#[Out]# D                    3                    3                    3   
#[Out]# E                 test                train                 test   
#[Out]# F                  foo                  foo                  foo   
#[Out]# 
#[Out]#                      3  
#[Out]# A                    1  
#[Out]# B  2013-01-02 00:00:00  
#[Out]# C                    1  
#[Out]# D                    3  
#[Out]# E                train  
#[Out]# F                  foo  
# Sort the columns (axis=1) by label in descending order: F, E, D, C, B, A.
df2.sort_index(axis=1, ascending=False)
#[Out]#      F      E  D    C          B    A
#[Out]# 0  foo   test  3  1.0 2013-01-02  1.0
#[Out]# 1  foo  train  3  1.0 2013-01-02  1.0
#[Out]# 2  foo   test  3  1.0 2013-01-02  1.0
#[Out]# 3  foo  train  3  1.0 2013-01-02  1.0
# Sort rows by the values in column 'B'. This has to be df2: the earlier `df`
# only has lowercase columns 'a'..'d', so by='B' raises KeyError there.
# Every 'B' value is identical, so the logged row order is unchanged.
df2.sort_values(by='B')
#[Out]#      A          B    C  D      E    F
#[Out]# 0  1.0 2013-01-02  1.0  3   test  foo
#[Out]# 1  1.0 2013-01-02  1.0  3  train  foo
#[Out]# 2  1.0 2013-01-02  1.0  3   test  foo
#[Out]# 3  1.0 2013-01-02  1.0  3  train  foo