# laika222
# 10/12/2019 - 5:05 PM

# NumPy Overview

####################################
### NumPy ###
####################################

####################################
### ndarray ###
####################################

# ndarray is a generic multidimensional container for homogeneous data. All elements must be the same type.
# Each array has a shape, a tuple indicating the size of each dimension, and a dtype, an object describing the data
# type of an array.

import numpy as np

# create a multi-dimensional array (2 rows, 3 columns) of random floats
data = np.random.rand(2, 3)
data

'''
Result is:

array([[0.56767265, 0.14682219, 0.67090378],
       [0.42464727, 0.75761126, 0.30702302]])
'''

# you can act upon each member in the array. Arithmetic returns a NEW array and leaves
# data untouched, so the product is kept under its own name in order to look at it.
scaled_data = data * 10
scaled_data

'''
Result is:

array([[5.67672653, 1.46822186, 6.70903776],
       [4.24647272, 7.57611258, 3.07023025]])
'''

data.shape # result is (2, 3)
data.dtype # result is dtype('float64')

# easiest way to create an array is to use the array function. This accepts any sequence-like object (including other arrays)
# and produces a new NumPy array containing the passed data.

data1 = [6, 7.5, 8, 0, 1]
array1 = np.array(data1)
array1 # result is array([6. , 7.5, 8. , 0. , 1. ])

# you can also use nested lists. Since this is a list of equal-length lists, the resulting NumPy array has two dimensions, with the shape inferred from the data.
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
array2 = np.array(data2)
array2

'''
Result is:

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])
'''

array2.shape # result is (2, 4)

# an ndarray can have a type which you can assign 
array1 = np.array([1, 2, 3], dtype = np.float64)
array1 = np.array([1, 2, 3], dtype = np.int32)

# astype can convert or cast an array as a different type. Calling astype always creates a new array (a copy of the data)
array = np.array([1, 2, 3, 4, 5])
array.dtype # result is dtype('int32')

float_array = array.astype(np.float64)
float_array.dtype # result is dtype('float64')

####################################
### ARRAY ARITHMETIC ###
####################################

# NumPy allows you to express batch operations without writing a for loop. This is called vectorization. Any arithmetic
# operation between equal-size arrays applies the operation element-wise:
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
arr
'''
Result is:

array([[1., 2., 3.],
       [4., 5., 6.]])
'''

arr * arr
'''
Result is: 

array([[ 1.,  4.,  9.],
       [16., 25., 36.]])
'''

arr - arr
'''
Result is: 

array([[0., 0., 0.],
       [0., 0., 0.]])
'''

# arithmetic with scalars propagates the scalar argument to each element in the array:
1 / arr
'''
Result is: 

array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])
'''

# comparisons between arrays of the same size yield boolean arrays.
# arr2 is built with np.array so that both operands really are ndarrays (not a plain nested list).
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])

arr2 > arr
'''
Result is: 

array([[False,  True, False],
       [ True, False,  True]])
'''

####################################
### ARRAY INDEXING AND SLICE INDEXING ###
####################################

# arange is similar to range
arr = np.arange(10)
arr # result is array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

arr[5] # result is 5

# you can slice
arr[5:8] # result is array([5, 6, 7])

# you can assign a value to a slice, which will propagate (or broadcast) the value to the entire selection in the slice. 
arr[5:8] = 12
arr # result is array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

# Array slices are views on the original array, meaning that the data is not copied, and any modifications to the 
# view will be reflected in the source array.
arr_slice = arr[5:8]
arr_slice # array([12, 12, 12])
# now, if you change values in arr_slice, the mutations are reflected in the original array arr
arr_slice[1] = 12345
arr_slice # result is array([   12, 12345,    12])
arr # result is array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,    9])

# the 'bare' slice will assign to all values in an array
arr_slice[:] = 64
arr_slice # result is array([64, 64, 64])
arr # result is array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

# to explicitly copy a slice instead of getting a view, use copy()
copied_arr = arr[0:4].copy()
copied_arr # result is array([0, 1, 2, 3])
arr[1] = 325532
arr[0:4] # result is array([     0, 325532,      2,      3])
copied_arr # note this hasn't changed since it was copied, result is array([0, 1, 2, 3])

# in higher dimensional arrays, the element at each index is itself an array with one fewer dimension (a one-dimensional
# array in the two-dimensional case), and individual elements can be accessed either recursively or as a comma separated
# list of indices. You can think of indexing on a two-dimensional array as the first number being the 'row' and the second
# number being the 'column'. Therefore [0][2] will access the first row and the third column
arr2d = np.array([[1, 2 ,3], [4, 5, 6], [7, 8, 9]])
arr2d 
'''
Result showing rows and columns:

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])
'''
arr2d[2] # result is array([7, 8, 9]), which is the third 'row'
arr2d[0][2] # method 1 of accessing an individual element in first 'row' and third 'column', result is 3
arr2d[0, 2] # method 2 of accessing an individual element in first 'row' and third 'column', result is 3
arr2d[:2] # example of a slice, kind of like saying give me the first two rows
'''
Result is:

array([[1, 2, 3],
       [4, 5, 6]])
'''

arr2d[:2, 1:] # example of multiple slices, kind of like saying give me up to the first two rows and the second column and all following
'''
Result is:

array([[2, 3],
       [5, 6]])
'''

arr2d[1, :2] # example of a slice and a normal index, asking for the second row and the first two columns, result is array([4, 5])

arr2d[:, :1] # example of a bare slice with another slice, with : pulling all rows and :1 pulling first column
'''
Result is:

array([[1],
       [4],
       [7]])
'''

arr2d[:, :1] = 0 # you can also make mutations in a similar fashion - this sets the first column to all zeroes
arr2d
'''
Result is:

array([[0, 2, 3],
       [0, 5, 6],
       [0, 8, 9]])
'''

# in multi-dimensional arrays, if you omit later indices, the returned object will be a lower dimensional ndarray
# consisting of all the data along the higher dimensions
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
arr3d
'''
Result is three dimensional array:
array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])
'''
arr3d[0][1][2] # result is 6
arr3d[0, 1, 2] # result is 6
arr3d[0]
'''
Result is:

array([[1, 2, 3],
       [4, 5, 6]])
'''

# both scalar values and arrays can be assigned to arr3d[0]
arr3d[0] = 42
arr3d
'''
Result is:

array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])
'''

####################################
### BOOLEAN INDEXING ###
####################################

# names labels the 7 rows of data: names[i] belongs to data[i]
names = np.array(['Soren', 'Liv', 'Thumbs', 'Soren', 'Liv', 'Liv', 'Will'])
data = np.random.randn(7, 4)
data
'''
Result is:

array([[-0.1211117 , -1.22088429,  0.10621492, -0.81788831],
       [-0.6880321 , -0.67625407, -0.38376663, -0.81110558],
       [-1.03575407,  0.08275155,  1.42818308,  0.75963265],
       [ 1.13184063,  0.63988033, -0.12147271, -0.55390123],
       [-0.74096649, -1.33766245, -0.71460405,  0.21090558],
       [-0.6125883 ,  1.57837986, -0.78419712, -1.28611674],
       [ 1.24892667, -0.77049204, -0.53295673, -2.868826  ]])

'''

# if you compare the array with some value, the comparison is vectorized and returns a boolean array
names == 'Soren' # result is array([ True, False, False,  True, False, False, False])

# this boolean array can be used to index other data: every row whose position holds True is kept.
# 'Soren' sits at positions 0 and 3, so this is the same as picking those two rows with data[[0, 3]]
# (it is NOT the same as data[:2], which would return rows 0 and 1).
# For this to work, the boolean array must be of the same length as the axis it's indexing.
data[names == 'Soren']
data[[0, 3]]
'''
Results of both of the above are:

array([[-0.1211117 , -1.22088429,  0.10621492, -0.81788831],
       [ 1.13184063,  0.63988033, -0.12147271, -0.55390123]])
'''

# to negate the condition, you can use != or ~
data[names != 'Soren']
data[~(names == 'Soren')]
'''
Results of both of the above are:

array([[-0.6880321 , -0.67625407, -0.38376663, -0.81110558],
       [-1.03575407,  0.08275155,  1.42818308,  0.75963265],
       [-0.74096649, -1.33766245, -0.71460405,  0.21090558],
       [-0.6125883 ,  1.57837986, -0.78419712, -1.28611674],
       [ 1.24892667, -0.77049204, -0.53295673, -2.868826  ]])
'''

# boolean conditions can also be combined using & (and) or | (or); the parentheses around each comparison are required
mask = (names == 'Soren') | (names == 'Will')
mask # result is array([ True, False, False,  True, False, False,  True])
mask2 = (names == 'Soren') & (names == 'Will')
mask2 # result is array([False, False, False, False, False, False, False])

####################################
### TRANSFORMING ARRAYS AND SWAPPING AXES ###
####################################

arr = np.arange(15).reshape((3, 5))
arr
'''
Result is:

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])
'''

# the T attribute gives you the transpose of the matrix
arr.T
'''
Result is:

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])
'''

# for higher dimensional arrays, transpose will accept a tuple of axis numbers to permute the axes

arr = np.arange(16).reshape((2, 2, 4))
'''
Result is:

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])
'''

# for higher dimensional arrays, transpose will accept a tuple of axis numbers to permute the axes.
# In this case, the axes will be reordered with the second axis first, the first axis second, and the last axis unchanged.
# The tuple is saying, list index [1], then [0], then [2].
arr.transpose((1, 0, 2))
'''
Result is:

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

Prior to this, the result was:

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])
'''

# you can also use swapaxes, which takes a pair of axis numbers and switches the indicated axes to rearrange the data
arr = np.arange(16).reshape((2, 2, 4))
arr
'''
Result is:

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])
'''

arr.swapaxes(1,2)
'''
Result is:

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])
'''

####################################
### UNIVERSAL FUNCTIONS: FAST ELEMENT-WISE ARRAY FUNCTIONS ###
####################################

# A universal function or ufunc is a function that performs element-wise operations on data in ndarrays.
# The first type is a unary ufunc that makes simple element-wise transformations:

arr = np.arange(10)
arr # result is array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

np.sqrt(arr) 

'''
Result is: 
array([0.        , 1.        , 1.41421356, 1.73205081, 2.        , 
       2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])
'''

np.exp(arr)

'''
Result is: 
array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
       2.98095799e+03, 8.10308393e+03])
'''

# The second type is a binary ufunc such as add or maximum that take two arrays and return a single array as a result:

first_array = np.random.randn(8)
second_array = np.random.randn(8)

first_array
''' Result is:
array([ 1.90604747,  0.27508265,  0.82439986, -0.19178404, -0.9814238 ,
        0.9802765 , -1.2675382 , -0.94728439])
'''

second_array
''' Result is:
array([ 1.77877677, -0.35833601, -0.27337685, -0.17301413, -0.5580396 ,
        0.74550886,  0.42002197, -1.24682839])
'''

# maximum returns the element-wise maximum of the two arrays
np.maximum(first_array, second_array)

''' Result is:
array([ 1.90604747,  0.27508265,  0.82439986, -0.17301413, -0.5580396 ,
        0.9802765 ,  0.42002197, -0.94728439])
'''

# add which will add the two arrays
np.add(first_array, second_array)

''' Result is:
array([ 3.68482425, -0.08325337,  0.55102301, -0.36479817, -1.53946339,
        1.72578536, -0.84751623, -2.19411278])
'''

# ufuncs accept an optional out argument that allows them to operate in-place on the arrays

myarray = np.random.randn(10)
myarray
''' Result is:
array([-0.57738876, -0.35990644,  1.06458676, -0.17363093,  1.14876476,
        1.81621875,  0.50279201,  2.43921165, -0.52994033, -1.10970804])
'''

np.sqrt(myarray, myarray) # the second argument is the out argument
''' Result is:
array([       nan,        nan, 1.03178814,        nan, 1.07180444,
       1.3476716 , 0.70907828, 1.56179757,        nan,        nan])
'''

####################################
### ARRAY-ORIENTED PROGRAMMING WITH ARRAYS ###
####################################

# numpy arrays allow you to express data processing tasks as array operations that might otherwise require writing loops.
# The practice of replacing explicit loops with array expressions is referred to as vectorization. 
# Vectorized array operations will often be faster than their pure Python equivalents.

# As an example, say you want to evaluate the function sqrt(x^2 + y^2) across a grid of values. You could use the np.meshgrid
# function to take two 1D arrays and produce two 2D matrices corresponding to all pairs of (x, y) in the two arrays.

import numpy as np
points = np.arange(-5, 5, 0.01) # 1000 equally spaced points

xs, ys = np.meshgrid(points, points)
ys
''' Result is:

array([[-5.  , -5.  , -5.  , ..., -5.  , -5.  , -5.  ],
       [-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99],
       [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98],
       ...,
       [ 4.97,  4.97,  4.97, ...,  4.97,  4.97,  4.97],
       [ 4.98,  4.98,  4.98, ...,  4.98,  4.98,  4.98],
       [ 4.99,  4.99,  4.99, ...,  4.99,  4.99,  4.99]])
'''

# Now you can evaluate the function in the same way as you would write it with two points:

z = np.sqrt(xs ** 2 + ys ** 2)
z
''' Result is:

array([[7.07106781, 7.06400028, 7.05693985, ..., 7.04988652, 7.05693985,
        7.06400028],
       [7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815,
        7.05692568],
       [7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354,
        7.04985815],
       ...,
       [7.04988652, 7.04279774, 7.03571603, ..., 7.0286414 , 7.03571603,
        7.04279774],
       [7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354,
        7.04985815],
       [7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815,
        7.05692568]])
'''

# you can then visualize the array
import matplotlib.pyplot as plt
plt.imshow(z, cmap=plt.cm.gray); plt.colorbar()
plt.title("This is my plot of square roots or something")
plt.show()

####################################
### EXPRESSING CONDITIONAL LOGIC AS ARRAY OPERATIONS ###
####################################

# numpy.where is the vectorized version of the ternary expression 'x if condition else y'. Say you have 
# a boolean array and two arrays of values, and you want to take a value from first_array whenever cond is
# True, otherwise take the value from second_array

first_array = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
second_array = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

# you could do this with a list comprehension, but it wouldn't be very fast, and it wouldn't work on 
# multi-dimensional arrays
result = [(x if c else y)
            for x, y, c in zip(first_array, second_array, cond)]
result # result is [1.1, 2.2, 1.3, 1.4, 2.5]

# that said, you could do this more quickly with np.where: it checks the condition (in this case the boolean
# values contained in the cond array), and where True selects from first_array, else selects from second_array.
# np.where returns a new array, so it has to be assigned for the result to be kept.
result = np.where(cond, first_array, second_array)
result # result is array([1.1, 2.2, 1.3, 1.4, 2.5])

# another example is to take an array and replace the values based on whether it is positive or negative.
# In this example, the second and third arguments are not arrays as in the previous example, but are instead scalar values.
myarray = np.random.randn(4,4)
myarray

''' Result is:
array([[-0.9417623 , -0.22975052,  1.35398507, -0.95858373],
       [-0.38293323, -0.7663218 , -0.14112195, -0.28233403],
       [ 1.25456155, -0.20698571,  3.32226647, -0.64237404],
       [-0.32100475, -0.06935425, -0.59368462, -0.20792726]])
'''

# Now say you want to evaluate each position and see if it is positive or negative
myarray > 0

''' Result is:
array([[False, False,  True, False],
       [False, False, False, False],
       [ True, False,  True, False],
       [False, False, False, False]])
'''

# You could then use that conditional statement in np.where to replace the values
np.where(myarray > 0, 2, -2) # if conditional statement is true for value in myarray, replace with 2, else replace with -2

''' Result is:
array([[-2, -2,  2, -2],
       [-2, -2, -2, -2],
       [ 2, -2,  2, -2],
       [-2, -2, -2, -2]])
'''

####################################
### MATHEMATICAL AND STATISTICAL METHODS ###
####################################

# dtype is pinned to int64: with a 32-bit default integer the flat cumulative product further down
# silently overflows and wraps around to negative numbers
arr = np.array([[1, 4, 3, 6, 3],
                [4, 6, 4, 9, 2],
                [8, 3, 1, 6, 3],
                [5, 4, 4, 5, 2],
                [7, 5, 4, 6, 6]],
               dtype=np.int64)

# you can get aggregate stats of the members of the array
arr.mean() # result is 4.44, mean of all elements
np.mean(arr) # result is 4.44
arr.sum() # 111, sum of all elements

# there is an optional axis argument that computes the statistic over the given axis, resulting in 
# an array with one fewer dimension.
# axis = 1 collapses the columns, giving one value per row (e.g. 1 + 4 + 3 + 6 + 3 for the first row)
arr.sum(axis = 1) # result is array([17, 25, 21, 20, 28]), the sum of each row
# axis = 0 collapses the rows, giving one value per column (e.g. 1 + 4 + 8 + 5 + 7 for the first column)
arr.sum(axis = 0) # result is array([25, 22, 16, 32, 16]), the sum of each column

arr.cumsum()

''' Result is the cumulative sum of all elements, flattened row by row:
array([  1,   5,   8,  14,  17,  21,  27,  31,  40,  42,  50,  53,  54,
        60,  63,  68,  72,  76,  81,  83,  90,  95,  99, 105, 111])
'''

# with axis, it'll return the same number of dimensions, but accumulate separately within each row
arr.cumsum(axis = 1)

''' Result is the cumulative sum along each row:
array([[ 1,  5,  8, 14, 17],
       [ 4, 10, 14, 23, 25],
       [ 8, 11, 12, 18, 21],
       [ 5,  9, 13, 18, 20],
       [ 7, 12, 16, 22, 28]])
'''

arr.cumprod()

''' Result is the cumulative product of all elements, flattened row by row:
array([              1,               4,              12,              72,
                   216,             864,            5184,           20736,
                186624,          373248,         2985984,         8957952,
               8957952,        53747712,       161243136,       806215680,
            3224862720,     12899450880,     64497254400,    128994508800,
          902961561600,   4514807808000,  18059231232000, 108355387392000,
       650132324352000])
'''

arr.cumprod(axis = 1)

''' Result is the cumulative product along each row:
array([[   1,    4,   12,   72,  216],
       [   4,   24,   96,  864, 1728],
       [   8,   24,   24,  144,  432],
       [   5,   20,   80,  400,  800],
       [   7,   35,  140,  840, 5040]])
'''

####################################
### BOOLEAN ARRAY METHODS ###
####################################

# boolean values are coerced to 1 (True) and 0 (False). Therefore, sum is often used as a means of
# counting True values in a boolean array.

arr = np.random.randn(100)
(arr > 0).sum() # result is the number of positive values in the array, or 54 in this case

# any tests whether one or more values in an array is True, while all tests if they're all True

boolean_array = np.array([False, False, True, False])
boolean_array.any() # result is True
boolean_array.all() # result is False

####################################
### SORTING AN ARRAY ###
####################################

# a one-dimensional array can be sorted with its sort method, which reorders the array itself
arr = np.random.randn(6)
arr # result is array([-1.29540316,  0.920679  , -0.27233344,  0.26339028, -2.00256255, 2.36187669])
arr.sort() # sorts the array in place (arr itself is reordered, as the next line shows)
arr # result is array([-2.00256255, -1.29540316, -0.27233344,  0.26339028,  0.920679, 2.36187669])

# for a multi-dimensional array, pass the axis number to sort along
arr = np.random.randn(5,3)
arr

''' Result is:

array([[-0.23198357,  0.88090255, -2.52562337],
       [ 0.15980551, -0.39624047, -0.58295457],
       [-0.06606336,  1.39889203,  0.86527283],
       [ 0.84156916, -0.36808526, -0.72742063],
       [ 0.07780976, -1.32896086, -1.21030049]])
'''

arr.sort(1) # sort along axis 1: the values within each row are put in ascending order
arr

''' Result is:

array([[-2.52562337, -0.23198357,  0.88090255],
       [-0.58295457, -0.39624047,  0.15980551],
       [-0.06606336,  0.86527283,  1.39889203],
       [-0.72742063, -0.36808526,  0.84156916],
       [-1.32896086, -1.21030049,  0.07780976]])
'''

arr.sort(0) # sort along axis 0: the values within each column are put in ascending order, each column independently
arr

''' Result is:

array([[-2.52562337, -1.21030049,  0.07780976],
       [-1.32896086, -0.39624047,  0.15980551],
       [-0.72742063, -0.36808526,  0.84156916],
       [-0.58295457, -0.23198357,  0.88090255],
       [-0.06606336,  0.86527283,  1.39889203]])
'''

####################################
### SET LOGIC ###
####################################

# getting unique values in an array
names = np.array(['Soren', 'Liv', 'Will', 'Soren', 'James'])
sorted(set(names)) # pure Python, returns the unique names in sorted order using set()
np.unique(names) # NumPy alternative, result is array(['James', 'Liv', 'Soren', 'Will'], dtype='<U5')

# testing if a value exists in an array, in this case searching for 2, 3 or 6 and returning a boolean array.
# np.isin is used rather than np.in1d, which is deprecated as of NumPy 2.0.
values = np.array([6, 0, 0, 3, 2, 5, 6])
membership = np.isin(values, [2, 3, 6]) # list of 2, 3, or 6 is what we're searching for
membership # result is array([ True, False, False,  True,  True, False,  True])

####################################
### PSEUDORANDOM NUMBER GENERATION ###
####################################

# numpy.random supplements the built-in Python random with functions for efficiently generating whole arrays of sample
# values from many kinds of probability distributions. For example, you can get a 4 x 4 array of samples from the 
# standard normal distribution using normal
samples = np.random.normal(size=(4,4))
samples

''' Result is:

array([[ 1.75657806,  1.85279794,  0.13050136, -0.23823481],
       [ 0.40257826,  0.90579335,  1.0795258 , -0.41758298],
       [ 0.18967166, -1.14146864, -0.39585093, -0.28985462],
       [-1.6696204 ,  1.31926929,  0.42371635, -0.30972391]])

'''

# the numbers are pseudorandom since they're generated by an algorithm with deterministic behavior
# based on the seed of the random number generator. You can set the NumPy seed:
np.random.seed(1234)

####################################
### WORKING EXAMPLE - RANDOM WALKS ###
####################################

# a pure-Python random walk: start at position 0 and take 1000 steps, each step randomly +1 or -1,
# recording the position after every step
import random
import matplotlib.pyplot as plt
position = 0
walk = [position]
steps = 1000
for i in range(steps):
    step = 1 if random.randint(0,1) else -1  # randint(0,1) yields 0 or 1, mapped here to -1 or +1
    position += step
    walk.append(position)

plt.plot(walk[:100]) # plot the first 100 positions of the walk
plt.show() # show the plot

# however, the walk is simply the cumulative sum of the random steps, so it can be written as an
# array expression instead of a loop. Note that this makes fresh random draws and does not include
# the starting position 0, so it is a new walk of the same kind rather than a copy of the one above.
nsteps = 1000
draws = np.random.randint(0,2,size = nsteps)  # nsteps draws, each 0 or 1
steps = np.where(draws > 0, 1, -1)  # map 1 -> +1 and 0 -> -1 (this rebinds steps, which held the step count above)
walk = steps.cumsum()

plt.plot(walk)
plt.show()