####################################
### NumPy ###
####################################
####################################
### ndarray ###
####################################
# ndarray is a generic multidimensional container for homogeneous data. All elements must be the same type.
# Each array has a shape, a tuple indicating the size of each dimension, and a dtype, an object describing the data
# type of an array.
import numpy as np
# create a multi-dimensional array
data = np.random.rand(2, 3)
data
# Example result (the values are random, so yours will differ):
# array([[0.56767265, 0.14682219, 0.67090378],
#        [0.42464727, 0.75761126, 0.30702302]])

# Arithmetic acts on every member of the array and returns a NEW array;
# `data` itself is not modified by the expression below.
data * 10
# Example result:
# array([[5.67672653, 1.46822186, 6.70903776],
#        [4.24647272, 7.57611258, 3.07023025]])
data  # unchanged: still the original values, not the scaled ones

data.shape  # result is (2, 3)
data.dtype  # result is dtype('float64')

# The easiest way to create an array is the array function. It accepts any
# sequence-like object (including other arrays) and produces a new NumPy array
# containing the passed data.
data1 = [6, 7.5, 8, 0, 1]
array1 = np.array(data1)
array1  # result is array([6. , 7.5, 8. , 0. , 1. ])

# Nested lists work too. A list of equal-length lists becomes a
# two-dimensional array whose shape is inferred from the data.
data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]
array2 = np.array(data2)
array2
# Result is:
# array([[1, 2, 3, 4],
#        [5, 6, 7, 8]])
array2.shape  # result is (2, 4)

# The element type of an ndarray can be chosen explicitly with dtype.
array1 = np.array([1, 2, 3], dtype=np.float64)
array1 = np.array([1, 2, 3], dtype=np.int32)

# astype converts (casts) an array to a different type. By default it creates
# a new array (a copy of the data), even when the dtype is unchanged.
array = np.array([1, 2, 3, 4, 5])
# The default integer type is platform dependent: dtype('int64') on most
# systems, dtype('int32') on Windows with NumPy older than 2.0.
array.dtype
float_array = array.astype(np.float64)
float_array.dtype  # result is dtype('float64')
####################################
### ARRAY ARITHMETIC ###
####################################
# NumPy allows you to express batch operations without writing a for loop. This is called vectorization. Any arithmetic
# operation between equal-size arrays is applied element-wise:
arr = np.array([[1., 2., 3.], [4., 5., 6.]])
arr
# Result is (a float64 array, hence the trailing dots):
# array([[1., 2., 3.],
#        [4., 5., 6.]])

arr * arr
# Result is the element-wise product:
# array([[ 1.,  4.,  9.],
#        [16., 25., 36.]])

arr - arr
# Result is the element-wise difference:
# array([[0., 0., 0.],
#        [0., 0., 0.]])

# Arithmetic with a scalar propagates the scalar to each element in the array:
1 / arr
# Result is:
# array([[1.        , 0.5       , 0.33333333],
#        [0.25      , 0.2       , 0.16666667]])

# Comparisons between arrays of the same size yield boolean arrays.
# arr2 is built with np.array so that both operands really are ndarrays
# (a bare nested list only works here because arr happens to be an ndarray).
arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])
arr2 > arr
# Result is:
# array([[False,  True, False],
#        [ True, False,  True]])
####################################
### ARRAY INDEXING AND SLICE INDEXING ###
####################################
# arange is similar to range
arr = np.arange(10)
arr     # -> array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
arr[5]  # -> 5

# A slice selects a contiguous run of elements.
arr[5:8]  # -> array([5, 6, 7])

# Assigning a scalar to a slice broadcasts the value over the whole selection.
arr[5:8] = 12
arr  # -> array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])

# A slice is a view on the original array: the data is not copied, so any
# write made through the view is reflected in the source array.
arr_slice = arr[5:8]
arr_slice  # -> array([12, 12, 12])

# Changing a value in arr_slice therefore also changes arr.
arr_slice[1] = 12345
arr_slice  # -> array([   12, 12345,    12])
arr        # -> array([    0,     1,     2,     3,     4,    12, 12345,    12,     8,     9])

# The "bare" slice [:] assigns to every element of the view.
arr_slice[:] = 64
arr_slice  # -> array([64, 64, 64])
arr        # -> array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

# Use copy() when an independent copy is wanted instead of a view.
copied_arr = arr[0:4].copy()
copied_arr  # -> array([0, 1, 2, 3])
arr[1] = 325532
arr[0:4]    # -> array([     0, 325532,      2,      3])
copied_arr  # -> array([0, 1, 2, 3]); untouched, because it was copied
# in higher dimensional arrays, elements at each index are lower-dimensional arrays, and individual elements can be
# accessed either recursively or as a comma-separated list of indices. You can think of indexing on a two-dimensional array
# as the first number being the 'row' and the second number being the 'column'. Therefore [0][2] will access the first row
# and the third column
arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
arr2d
# Rows and columns of the result:
# array([[1, 2, 3],
#        [4, 5, 6],
#        [7, 8, 9]])

arr2d[2]     # third "row" -> array([7, 8, 9])
arr2d[0][2]  # element access, method 1 (recursive): first row, third column -> 3
arr2d[0, 2]  # element access, method 2 (comma separated): same element -> 3

# A single slice works on the first axis: "give me the first two rows".
arr2d[:2]
# array([[1, 2, 3],
#        [4, 5, 6]])

# Several slices at once: the first two rows, from the second column onwards.
arr2d[:2, 1:]
# array([[2, 3],
#        [5, 6]])

# A normal index mixed with a slice: second row, first two columns.
arr2d[1, :2]  # -> array([4, 5])

# A bare slice mixed with another slice: ":" keeps every row, ":1" keeps the
# first column.
arr2d[:, :1]
# array([[1],
#        [4],
#        [7]])

# The same selections can be assigned to; this zeroes the first column.
arr2d[:, :1] = 0
arr2d
# array([[0, 2, 3],
#        [0, 5, 6],
#        [0, 8, 9]])

# In multi-dimensional arrays, omitting the later indices returns a
# lower-dimensional ndarray holding all the data along the remaining axes.
arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
arr3d
# A three-dimensional array:
# array([[[ 1,  2,  3],
#         [ 4,  5,  6]],
#
#        [[ 7,  8,  9],
#         [10, 11, 12]]])

arr3d[0][1][2]  # -> 6
arr3d[0, 1, 2]  # -> 6
arr3d[0]
# array([[1, 2, 3],
#        [4, 5, 6]])

# Both scalar values and arrays can be assigned to arr3d[0].
arr3d[0] = 42
arr3d
# array([[[42, 42, 42],
#         [42, 42, 42]],
#
#        [[ 7,  8,  9],
#         [10, 11, 12]]])
####################################
### BOOLEAN INDEXING ###
####################################
names = np.array(['Soren', 'Liv', 'Thumbs', 'Soren', 'Liv', 'Liv', 'Will'])
data = np.random.randn(7, 4)
data
# Example result (the values are random, so yours will differ):
# array([[-0.1211117 , -1.22088429,  0.10621492, -0.81788831],
#        [-0.6880321 , -0.67625407, -0.38376663, -0.81110558],
#        [-1.03575407,  0.08275155,  1.42818308,  0.75963265],
#        [ 1.13184063,  0.63988033, -0.12147271, -0.55390123],
#        [-0.74096649, -1.33766245, -0.71460405,  0.21090558],
#        [-0.6125883 ,  1.57837986, -0.78419712, -1.28611674],
#        [ 1.24892667, -0.77049204, -0.53295673, -2.868826  ]])

# Comparing an array with a value returns a boolean array.
names == 'Soren'  # result is array([ True, False, False,  True, False, False, False])

# The boolean array can be used to index another array; it selects the rows
# where the mask is True. 'Soren' sits at positions 0 and 3, so this is the
# same as picking rows 0 and 3 explicitly (NOT the same as data[:2], which
# would return rows 0 and 1).
# For this to work, the boolean array must have the same length as the axis
# it is indexing.
data[names == 'Soren']
data[[0, 3]]
# Both of the above give:
# array([[-0.1211117 , -1.22088429,  0.10621492, -0.81788831],
#        [ 1.13184063,  0.63988033, -0.12147271, -0.55390123]])

# To negate the condition, use != or ~.
data[names != 'Soren']
data[~(names == 'Soren')]
# Both of the above give:
# array([[-0.6880321 , -0.67625407, -0.38376663, -0.81110558],
#        [-1.03575407,  0.08275155,  1.42818308,  0.75963265],
#        [-0.74096649, -1.33766245, -0.71460405,  0.21090558],
#        [-0.6125883 ,  1.57837986, -0.78419712, -1.28611674],
#        [ 1.24892667, -0.77049204, -0.53295673, -2.868826  ]])

# Boolean conditions can be combined with & (and) and | (or). The
# parentheses are required because & and | bind tighter than ==.
mask = (names == 'Soren') | (names == 'Will')
mask  # result is array([ True, False, False,  True, False, False,  True])
mask2 = (names == 'Soren') & (names == 'Will')
mask2  # result is array([False, False, False, False, False, False, False])
####################################
### TRANSFORMING ARRAYS AND SWAPPING AXES ###
####################################
arr = np.arange(15).reshape(3, 5)
arr
# array([[ 0,  1,  2,  3,  4],
#        [ 5,  6,  7,  8,  9],
#        [10, 11, 12, 13, 14]])

# The T attribute gives the transpose of the matrix.
arr.T
# array([[ 0,  5, 10],
#        [ 1,  6, 11],
#        [ 2,  7, 12],
#        [ 3,  8, 13],
#        [ 4,  9, 14]])

arr = np.arange(16).reshape(2, 2, 4)
# arr is now:
# array([[[ 0,  1,  2,  3],
#         [ 4,  5,  6,  7]],
#
#        [[ 8,  9, 10, 11],
#         [12, 13, 14, 15]]])

# For higher dimensional arrays, transpose accepts the axis numbers in the
# order they should appear. Here the second axis comes first, the first axis
# second, and the last axis stays where it is: axis [1], then [0], then [2].
arr.transpose(1, 0, 2)
# Result is:
# array([[[ 0,  1,  2,  3],
#         [ 8,  9, 10, 11]],
#
#        [[ 4,  5,  6,  7],
#         [12, 13, 14, 15]]])
# Before the transpose the array was:
# array([[[ 0,  1,  2,  3],
#         [ 4,  5,  6,  7]],
#
#        [[ 8,  9, 10, 11],
#         [12, 13, 14, 15]]])

# swapaxes takes a pair of axis numbers and switches those two axes.
arr = np.arange(16).reshape(2, 2, 4)
arr
# array([[[ 0,  1,  2,  3],
#         [ 4,  5,  6,  7]],
#
#        [[ 8,  9, 10, 11],
#         [12, 13, 14, 15]]])

arr.swapaxes(1, 2)
# array([[[ 0,  4],
#         [ 1,  5],
#         [ 2,  6],
#         [ 3,  7]],
#
#        [[ 8, 12],
#         [ 9, 13],
#         [10, 14],
#         [11, 15]]])
####################################
### UNIVERSAL FUNCTIONS: FAST ELEMENT-WISE ARRAY FUNCTIONS ###
####################################
# A universal function, or ufunc, is a function that performs element-wise operations on data in ndarrays.
# The first type is a unary ufunc, which performs a simple element-wise transformation:
arr = np.arange(10)
arr  # -> array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

np.sqrt(arr)
# array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
#        2.23606798, 2.44948974, 2.64575131, 2.82842712, 3.        ])

np.exp(arr)
# array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
#        5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03,
#        2.98095799e+03, 8.10308393e+03])

# The second type is a binary ufunc, such as add or maximum, which takes two
# arrays and returns a single array as the result.
first_array = np.random.randn(8)
second_array = np.random.randn(8)

first_array
# Example result (random values):
# array([ 1.90604747,  0.27508265,  0.82439986, -0.19178404, -0.9814238 ,
#         0.9802765 , -1.2675382 , -0.94728439])

second_array
# Example result (random values):
# array([ 1.77877677, -0.35833601, -0.27337685, -0.17301413, -0.5580396 ,
#         0.74550886,  0.42002197, -1.24682839])

# maximum returns the element-wise maximum of the two arrays.
np.maximum(first_array, second_array)
# array([ 1.90604747,  0.27508265,  0.82439986, -0.17301413, -0.5580396 ,
#         0.9802765 ,  0.42002197, -0.94728439])

# add returns the element-wise sum of the two arrays.
np.add(first_array, second_array)
# array([ 3.68482425, -0.08325337,  0.55102301, -0.36479817, -1.53946339,
#         1.72578536, -0.84751623, -2.19411278])

# ufuncs accept an optional out argument that lets them write the result
# in place into an existing array.
myarray = np.random.randn(10)
myarray
# Example result (random values):
# array([-0.57738876, -0.35990644,  1.06458676, -0.17363093,  1.14876476,
#         1.81621875,  0.50279201,  2.43921165, -0.52994033, -1.10970804])

# myarray is both the input and the output, so it is overwritten with its own
# square roots; negative entries become nan.
np.sqrt(myarray, out=myarray)
# array([       nan,        nan, 1.03178814,        nan, 1.07180444,
#        1.3476716 , 0.70907828, 1.56179757,        nan,        nan])
####################################
### ARRAY-ORIENTED PROGRAMMING WITH ARRAYS ###
####################################
# NumPy arrays allow you to express data processing tasks as array operations that might otherwise require writing loops.
# The practice of replacing explicit loops with array expressions is referred to as vectorization.
# Vectorized array operations will often be faster than their pure Python equivalents.
# As an example, say you want to evaluate the function sqrt(x^2 + y^2) across a grid of values. You could use the np.meshgrid
# function to take two 1D arrays and produce two 2D matrices corresponding to all pairs of (x, y) in the two arrays.
import numpy as np
# 1000 equally spaced points from -5 (inclusive) to 5 (exclusive)
points = np.arange(-5, 5, 0.01)

# xs varies along the columns and ys along the rows; together they hold
# every (x, y) pair of the grid.
xs, ys = np.meshgrid(points, points)
ys
# array([[-5.  , -5.  , -5.  , ..., -5.  , -5.  , -5.  ],
#        [-4.99, -4.99, -4.99, ..., -4.99, -4.99, -4.99],
#        [-4.98, -4.98, -4.98, ..., -4.98, -4.98, -4.98],
#        ...,
#        [ 4.97,  4.97,  4.97, ...,  4.97,  4.97,  4.97],
#        [ 4.98,  4.98,  4.98, ...,  4.98,  4.98,  4.98],
#        [ 4.99,  4.99,  4.99, ...,  4.99,  4.99,  4.99]])

# The function is evaluated over the whole grid exactly as it would be
# written for a single pair of points.
z = np.sqrt(xs ** 2 + ys ** 2)
z
# array([[7.07106781, 7.06400028, 7.05693985, ..., 7.04988652, 7.05693985,
#         7.06400028],
#        [7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815,
#         7.05692568],
#        [7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354,
#         7.04985815],
#        ...,
#        [7.04988652, 7.04279774, 7.03571603, ..., 7.0286414 , 7.03571603,
#         7.04279774],
#        [7.05693985, 7.04985815, 7.04278354, ..., 7.03571603, 7.04278354,
#         7.04985815],
#        [7.06400028, 7.05692568, 7.04985815, ..., 7.04279774, 7.04985815,
#         7.05692568]])
# you can then visualize the array
import matplotlib.pyplot as plt
# Draw z as a grayscale image and add a colour bar next to it.
plt.imshow(z, cmap=plt.cm.gray)
plt.colorbar()
plt.title("This is my plot of square roots or something")
plt.show()
####################################
### EXPRESSING CONDITIONAL LOGIC AS ARRAY OPERATIONS ###
####################################
# numpy.where is the vectorized version of the ternary expression 'x if condition else y'. Say you have
# a boolean array and two arrays of values, and you want to take a value from first_array whenever cond is
# True, otherwise take the value from second_array
first_array = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
second_array = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

# You could do this with a list comprehension, but it would not be very fast
# on large arrays, and it would not work on multi-dimensional arrays.
result = [(x if c else y)
          for x, y, c in zip(first_array, second_array, cond)]
result  # a Python list holding the values 1.1, 2.2, 1.3, 1.4, 2.5

# np.where does the same thing more quickly: wherever cond is True the value
# is taken from first_array, otherwise from second_array. The outcome is
# stored in result so that the line below shows the np.where output rather
# than the earlier list.
result = np.where(cond, first_array, second_array)
result  # result is array([1.1, 2.2, 1.3, 1.4, 2.5])

# Another example: replace the values of an array depending on whether they
# are positive or negative. Here the second and third arguments are scalars
# instead of arrays.
myarray = np.random.randn(4, 4)
myarray
# Example result (random values):
# array([[-0.9417623 , -0.22975052,  1.35398507, -0.95858373],
#        [-0.38293323, -0.7663218 , -0.14112195, -0.28233403],
#        [ 1.25456155, -0.20698571,  3.32226647, -0.64237404],
#        [-0.32100475, -0.06935425, -0.59368462, -0.20792726]])

# Evaluate each position and see whether it is positive.
myarray > 0
# array([[False, False,  True, False],
#        [False, False, False, False],
#        [ True, False,  True, False],
#        [False, False, False, False]])

# Use that condition in np.where: 2 where the value is positive, -2 elsewhere.
np.where(myarray > 0, 2, -2)
# array([[-2, -2,  2, -2],
#        [-2, -2, -2, -2],
#        [ 2, -2,  2, -2],
#        [-2, -2, -2, -2]])
####################################
### MATHEMATICAL AND STATISTICAL METHODS ###
####################################
# int64 is requested explicitly: with a 32-bit default integer (Windows with
# NumPy older than 2.0) the cumulative products below silently overflow.
arr = np.array([[1, 4, 3, 6, 3],
                [4, 6, 4, 9, 2],
                [8, 3, 1, 6, 3],
                [5, 4, 4, 5, 2],
                [7, 5, 4, 6, 6]],
               dtype=np.int64)

# Aggregate statistics over all members of the array.
arr.mean()    # result is 4.44, the mean of all elements
np.mean(arr)  # result is 4.44
arr.sum()     # result is 111, the sum of all elements

# The optional axis argument computes the statistic along the given axis and
# returns an array with one fewer dimension.
# axis=1 aggregates across the columns, giving one value per row
# (arr.mean(1) is likewise the mean of each row).
arr.sum(axis=1)  # result is array([17, 25, 21, 20, 28])
# axis=0 aggregates down the rows, giving one value per column
# (the first entry is 1 + 4 + 8 + 5 + 7).
arr.sum(axis=0)  # result is array([25, 22, 16, 32, 16])

arr.cumsum()
# Result is the cumulative sum of all elements, flattened:
# array([  1,   5,   8,  14,  17,  21,  27,  31,  40,  42,  50,  53,  54,
#         60,  63,  68,  72,  76,  81,  83,  90,  95,  99, 105, 111])

# With axis the result keeps the shape of the input and accumulates within
# each row separately.
arr.cumsum(axis=1)
# Result is the cumulative sum along each row:
# array([[ 1,  5,  8, 14, 17],
#        [ 4, 10, 14, 23, 25],
#        [ 8, 11, 12, 18, 21],
#        [ 5,  9, 13, 18, 20],
#        [ 7, 12, 16, 22, 28]])

arr.cumprod()
# Result is the cumulative product of all elements, flattened:
# array([              1,               4,              12,              72,
#                    216,             864,            5184,           20736,
#                 186624,          373248,         2985984,         8957952,
#                8957952,        53747712,       161243136,       806215680,
#             3224862720,     12899450880,     64497254400,    128994508800,
#           902961561600,   4514807808000,  18059231232000, 108355387392000,
#        650132324352000])

arr.cumprod(axis=1)
# Result is the cumulative product along each row:
# array([[   1,    4,   12,   72,  216],
#        [   4,   24,   96,  864, 1728],
#        [   8,   24,   24,  144,  432],
#        [   5,   20,   80,  400,  800],
#        [   7,   35,  140,  840, 5040]])
####################################
### BOOLEAN ARRAY METHODS ###
####################################
# boolean values are coerced to 1 (True) and 0 (False). Therefore, sum is often used as a means of
# counting True values in a boolean array.
arr = np.random.randn(100)
# Number of positive values in the array; 54 in the recorded run, but the
# data is random so the count varies.
(arr > 0).sum()

# any() is True when at least one value is True; all() only when every value is.
boolean_array = np.array([False, False, True, False])
boolean_array.any()  # -> True
boolean_array.all()  # -> False
####################################
### SORTING AN ARRAY ###
####################################
arr = np.random.randn(6)
arr
# Example result (random values):
# array([-1.29540316,  0.920679  , -0.27233344,  0.26339028, -2.00256255,  2.36187669])

# sort() orders the array in place.
arr.sort()
arr
# array([-2.00256255, -1.29540316, -0.27233344,  0.26339028,  0.920679  ,  2.36187669])

arr = np.random.randn(5, 3)
arr
# Example result (random values):
# array([[-0.23198357,  0.88090255, -2.52562337],
#        [ 0.15980551, -0.39624047, -0.58295457],
#        [-0.06606336,  1.39889203,  0.86527283],
#        [ 0.84156916, -0.36808526, -0.72742063],
#        [ 0.07780976, -1.32896086, -1.21030049]])

# Sort along the second dimension: the values within each row are ordered.
arr.sort(axis=1)
arr
# array([[-2.52562337, -0.23198357,  0.88090255],
#        [-0.58295457, -0.39624047,  0.15980551],
#        [-0.06606336,  0.86527283,  1.39889203],
#        [-0.72742063, -0.36808526,  0.84156916],
#        [-1.32896086, -1.21030049,  0.07780976]])

# Sort along the first dimension: the values within each column are ordered.
arr.sort(axis=0)
arr
# array([[-2.52562337, -1.21030049,  0.07780976],
#        [-1.32896086, -0.39624047,  0.15980551],
#        [-0.72742063, -0.36808526,  0.84156916],
#        [-0.58295457, -0.23198357,  0.88090255],
#        [-0.06606336,  0.86527283,  1.39889203]])
####################################
### SET LOGIC ###
####################################
# getting unique values in an array
names = np.array(['Soren', 'Liv', 'Will', 'Soren', 'James'])
sorted(set(names))  # pure Python: unique names via set(), then sorted
np.unique(names)    # NumPy alternative, result is array(['James', 'Liv', 'Soren', 'Will'], dtype='<U5')

# Test which elements of an array are members of another collection; here we
# look for 2, 3 or 6 and get back a boolean array of the same shape.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
values = np.array([6, 0, 0, 3, 2, 5, 6])
np.isin(values, [2, 3, 6])  # result is array([ True, False, False,  True,  True, False,  True])
####################################
### PSEUDORANDOM NUMBER GENERATION ###
####################################
# numpy.random supplements the built-in Python random module with functions for efficiently generating whole arrays of
# sample values from many kinds of probability distributions. For example, you can get a 4 x 4 array of samples from the
# standard normal distribution using normal
samples = np.random.normal(size=(4, 4))
samples
# Example result (random values):
# array([[ 1.75657806,  1.85279794,  0.13050136, -0.23823481],
#        [ 0.40257826,  0.90579335,  1.0795258 , -0.41758298],
#        [ 0.18967166, -1.14146864, -0.39585093, -0.28985462],
#        [-1.6696204 ,  1.31926929,  0.42371635, -0.30972391]])

# The numbers are pseudorandom: an algorithm with deterministic behavior
# generates them from the seed of the random number generator. Setting the
# seed only affects draws made after this call.
np.random.seed(1234)
####################################
### WORKING EXAMPLE - RANDOM WALKS ###
####################################
# this is the way to create a random walk plot of values incrementing up by 1 or down by -1 randomly
import random
import matplotlib.pyplot as plt
# Pure-Python random walk: start at 0 and move up or down by 1 at every step.
position = 0
walk = [position]  # every position visited, starting point included
steps = 1000
for _ in range(steps):
    # randint(0, 1) returns 0 or 1 with equal probability: 1 -> step up, 0 -> step down
    step = 1 if random.randint(0, 1) else -1
    position += step
    walk.append(position)
# Only the first 100 entries of the walk are drawn.
plt.plot(walk[:100]) # create the plot
plt.show() # show the plot
# however, you could also assume that the walk is simply the cumulative sum of the random steps
# and could be evaluated as an array expression which will give you the same result
nsteps = 1000
# One coin flip per step: randint's upper bound is exclusive, so each draw is 0 or 1.
draws = np.random.randint(0, 2, size=nsteps)
# Map the flips onto moves: 1 -> +1, 0 -> -1.
steps = np.where(draws > 0, 1, -1)
# The position after each step is the running total of the moves.
walk = np.cumsum(steps)
# Draw every entry of walk and display the figure.
plt.plot(walk)
plt.show()