# Learning pandas with IPython's %logstart
# IPython log file
# --- Workspace setup through IPython line magics -------------------------
# These calls rely on get_ipython(), so they only work when this log is
# replayed inside an IPython session.

# List the starting directory, move up one level and list again.
get_ipython().run_line_magic('ls', '')
get_ipython().run_line_magic('cd', '..')
get_ipython().run_line_magic('ls', '')
# Create the data-science/ working directory, enter it and list it.
get_ipython().run_line_magic('mkdir', 'data-science')
get_ipython().run_line_magic('cd', 'data-science/')
get_ipython().run_line_magic('ls', '')
# First logging attempt: start logging with no options, then switch it off.
get_ipython().run_line_magic('logstart', '')
get_ipython().run_line_magic('logoff', '')
get_ipython().run_line_magic('ls', '')
# Delete ipython_log.py (presumably the file created by the bare %logstart
# above; it is IPython's documented default log name).
# NOTE(review): this removes a file from the current directory on replay.
get_ipython().run_line_magic('rm', 'ipython_log.py')
# Start logging again with -o, which per the IPython docs also records
# output (consistent with the "#[Out]#" lines further down in this file).
get_ipython().run_line_magic('logstart', '-o')
get_ipython().run_line_magic('ls', '')
# Stop logging and list the directory once more.
get_ipython().run_line_magic('logstop', '')
get_ipython().run_line_magic('ls', '')
import pandas as pd
import numpy as np
# A Series built from a plain list; the np.nan entry marks a missing value
# and the result is stored as float64 (see the recorded output below).
s = pd.Series(
    [1, 3, 5, np.nan, 44, 1],
)
s
#[Out]# 0 1.0
#[Out]# 1 3.0
#[Out]# 2 5.0
#[Out]# 3 NaN
#[Out]# 4 44.0
#[Out]# 5 1.0
#[Out]# dtype: float64
# np.nan is NumPy's floating-point "not a number" marker.
np.nan
#[Out]# nan
# Six consecutive daily timestamps starting at 2016-01-01; used below as
# the row index of `df`.
dates = pd.date_range('20160101', periods=6)
dates
#[Out]# DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
#[Out]# '2016-01-05', '2016-01-06'],
#[Out]# dtype='datetime64[ns]', freq='D')
# 6x4 frame of standard-normal samples, rows labelled by `dates` and
# columns named 'a'..'d'. The session's first attempts referenced the
# misspelled name `datas` (NameError) and have been dropped from the log.
# The numbers in the recorded outputs are random and will differ per run.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
df
#[Out]# a b c d
#[Out]# 2016-01-01 -0.145907 -1.792104 0.069336 0.337732
#[Out]# 2016-01-02 -0.369874 0.752100 -0.885715 -1.214447
#[Out]# 2016-01-03 1.199736 0.896241 -0.391990 -0.037802
#[Out]# 2016-01-04 -0.559594 -0.978500 -1.249755 -0.597672
#[Out]# 2016-01-05 -0.791632 -0.764229 -0.062393 -0.195366
#[Out]# 2016-01-06 -0.499364 0.987771 0.599102 1.004094
# Look at the generator on its own: the function object, then a fresh
# 6x4 sample array.
np.random.randn
#[Out]# <function RandomState.randn>
np.random.randn(6, 4)
#[Out]# array([[-0.78583823, -1.24042436, -0.92060624, 0.01999883],
#[Out]# [ 0.63252708, 0.44840846, -0.21081448, -1.05017575],
#[Out]# [ 0.32929554, 0.96843722, -0.56634213, -1.08908219],
#[Out]# [-1.37842549, 1.05831645, 0.31995612, 0.40955217],
#[Out]# [-0.70499323, 0.47768134, 0.38585561, 0.39090227],
#[Out]# [-0.12303527, 0.50538366, -1.53361238, 2.37128177]])
# Selecting a single column by label returns a Series named after it.
df['b']
#[Out]# 2016-01-01 -1.792104
#[Out]# 2016-01-02 0.752100
#[Out]# 2016-01-03 0.896241
#[Out]# 2016-01-04 -0.978500
#[Out]# 2016-01-05 -0.764229
#[Out]# 2016-01-06 0.987771
#[Out]# Freq: D, Name: b, dtype: float64
# 3x4 frame holding the integers 0..11 in row-major order; with no labels
# given, rows and columns get default integer labels.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
df1
#[Out]# 0 1 2 3
#[Out]# 0 0 1 2 3
#[Out]# 1 4 5 6 7
#[Out]# 2 8 9 10 11
# Inspect the pieces used above: the flat range first, then the same
# values reshaped to 3 rows x 4 columns. (A mistyped `np.arrange` call,
# which raises AttributeError, has been dropped from the log.)
np.arange(12)
#[Out]# array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
np.arange(12).reshape(3, 4)
#[Out]# array([[ 0, 1, 2, 3],
#[Out]# [ 4, 5, 6, 7],
#[Out]# [ 8, 9, 10, 11]])
df1
#[Out]# 0 1 2 3
#[Out]# 0 0 1 2 3
#[Out]# 1 4 5 6 7
#[Out]# 2 8 9 10 11
# Build a frame from a dict of columns with mixed types. Scalar entries
# ('A', 'B', 'F') are repeated for every row; the four-element entries
# ('C', 'D', 'E') set the row count to four.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2
#[Out]# A B C D E F
#[Out]# 0 1.0 2013-01-02 1.0 3 test foo
#[Out]# 1 1.0 2013-01-02 1.0 3 train foo
#[Out]# 2 1.0 2013-01-02 1.0 3 test foo
#[Out]# 3 1.0 2013-01-02 1.0 3 train foo
# Per-column dtypes.
df2.dtypes
#[Out]# A float64
#[Out]# B datetime64[ns]
#[Out]# C float32
#[Out]# D int32
#[Out]# E category
#[Out]# F object
#[Out]# dtype: object
# Row labels are printed to stdout, so no "#[Out]#" entry was recorded.
print(df2.index)
# Column labels.
df2.columns
#[Out]# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
# Underlying data as a NumPy array (object dtype because the columns mix types).
df2.values
#[Out]# array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
#[Out]# [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
#[Out]# [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
#[Out]# [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']], dtype=object)
# Summary statistics; in the recorded run only columns A, C and D appear.
df2.describe()
#[Out]# A C D
#[Out]# count 4.0 4.0 4.0
#[Out]# mean 1.0 1.0 3.0
#[Out]# std 0.0 0.0 0.0
#[Out]# min 1.0 1.0 3.0
#[Out]# 25% 1.0 1.0 3.0
#[Out]# 50% 1.0 1.0 3.0
#[Out]# 75% 1.0 1.0 3.0
#[Out]# max 1.0 1.0 3.0
# Transpose: columns become rows and vice versa.
df2.T
#[Out]# 0 1 2 \
#[Out]# A 1 1 1
#[Out]# B 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00
#[Out]# C 1 1 1
#[Out]# D 3 3 3
#[Out]# E test train test
#[Out]# F foo foo foo
#[Out]#
#[Out]# 3
#[Out]# A 1
#[Out]# B 2013-01-02 00:00:00
#[Out]# C 1
#[Out]# D 3
#[Out]# E train
#[Out]# F foo
# Sort by column labels (axis=1) in descending order: F, E, ..., A.
# (A first attempt with an extra closing parenthesis was a SyntaxError and
# has been dropped from the log.)
df2.sort_index(axis=1, ascending=False)
#[Out]# F E D C B A
#[Out]# 0 foo test 3 1.0 2013-01-02 1.0
#[Out]# 1 foo train 3 1.0 2013-01-02 1.0
#[Out]# 2 foo test 3 1.0 2013-01-02 1.0
#[Out]# 3 foo train 3 1.0 2013-01-02 1.0
# Sort rows by the values in column 'B'. This must target df2: `df` only
# has lowercase columns 'a'..'d', so sorting it by 'B' raises KeyError.
# Every 'B' value is identical here, so the recorded row order is unchanged.
df2.sort_values(by='B')
#[Out]# A B C D E F
#[Out]# 0 1.0 2013-01-02 1.0 3 test foo
#[Out]# 1 1.0 2013-01-02 1.0 3 train foo
#[Out]# 2 1.0 2013-01-02 1.0 3 test foo
#[Out]# 3 1.0 2013-01-02 1.0 3 train foo