Symbolic computing for ML#
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/jdariasl/OTBD/main/content/init.py
import init; init.init(force_download=False)
import sys
import sympy as sy
from IPython.display import Image
%load_ext tensorboard
sy.init_printing(use_latex=True)
import tensorflow as tf
tf.__version__
2024-12-10 17:19:50.860339: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
'2.14.0'
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import minimize
%matplotlib inline
d = pd.read_csv("local/datasets/trilotropicos.csv")
print(d.shape)
plt.scatter(d.longitud, d.densidad_escamas)
plt.xlabel(d.columns[0])
plt.ylabel(d.columns[1]);
(150, 2)
data:image/s3,"s3://crabby-images/46c95/46c950111d71cd1058286f34fad2aa1d6f39014a" alt="../_images/2e8b795093074d54cc1c3d282f240d392d701404fc7af910dc46ef1cdbda11cf.png"
Let’s define a simple loss function#
The gradient of \(\mathcal{L}(\cdot)\) is:
y = d.densidad_escamas.values
X = np.r_[[[1]*len(d), d.longitud.values]].T
def n_cost(t):
return np.mean((X.dot(t)-y)**2)
def n_grad(t):
return 2*X.T.dot(X.dot(t)-y)/len(X)
init_t = np.random.random()*40-5, np.random.random()*20-10
r = minimize(n_cost, init_t, method="BFGS", jac=n_grad)
r
fun: 2.7447662570809355
hess_inv: array([[ 5.49985723, -1.07531773],
[-1.07531773, 0.23126819]])
jac: array([-2.44934669e-08, 2.52831629e-06])
message: 'Optimization terminated successfully.'
nfev: 10
nit: 9
njev: 10
status: 0
success: True
x: array([12.68999516, -0.71805846])
Using sympy
computer algebra system (CAS)#
x,y = sy.symbols("x y")
z = x**2 + x*sy.cos(y)
z
data:image/s3,"s3://crabby-images/5046f/5046fab7fd2cbc038e7f2bf93cb8191f6d7b1c09" alt="../_images/5fdf2e2d61a6176a962d7a0929cb547d2afa2258dbb2ce1237283221e5c90839.png"
we can evaluate the expresion by providing concrete values for the symbolic variables
z.subs({x: 2, y: sy.pi/4})
data:image/s3,"s3://crabby-images/6c48e/6c48ec44c11b220b323f3560078e951a6db38811" alt="../_images/a0e1ccc3fe257c1c744955804d99625eae3051c7b086a32819c9b766ac3b067d.png"
z.subs({x: 2})
data:image/s3,"s3://crabby-images/5e725/5e7251735158b70810ddbd1eb147de411d947cc5" alt="../_images/887fa233aedee453d4832c2b01e457919f3d7cb1ff5da770aaa5902cff1062c7.png"
and obtain numerical approximations of these values
sy.N(z.subs({x: 2, y: sy.pi/4}))
data:image/s3,"s3://crabby-images/c6fcd/c6fcd8a4e93a4176a612b49b3967a629dacb4ee9" alt="../_images/6012faf1091d3fa7bcb34fd595d2cedbda446268e8f8f45ef9e1fd8d87790d85.png"
a derivative can be seen as a function that inputs and expression and outputs another expression
observe how we compute \(\frac{\partial z}{\partial x}\) and \(\frac{\partial z}{\partial y}\)
z.diff(x)
data:image/s3,"s3://crabby-images/01387/01387a6d3bc6a2de0597be549c55d5e2c03c31b8" alt="../_images/13f410ad079c371cc362b16ed611683530351281729420413386ef4163315f98.png"
z.diff(y)
data:image/s3,"s3://crabby-images/aef07/aef07315e53696792e3fd3fbdd78b41852c20895" alt="../_images/0e4849370e205c6fec8054b18180cb11a792563a08ba23451b79c61251c344f3.png"
r = z.diff(x).subs({x: 2, y: sy.pi/4})
r, sy.N(r)
data:image/s3,"s3://crabby-images/84be1/84be1a52001fe64d645413ddd7a0201ef29effc6" alt="../_images/5b66f3dfdc47412f6281b381612fa53de52b02e75d29c2465512cbd973bf22e5.png"
r = z.diff(y).subs({x: 2, y: sy.pi/4})
r, sy.N(r)
data:image/s3,"s3://crabby-images/bd54f/bd54facfbaf74724c1af05c92911f71dbed796d7" alt="../_images/cf9d68a9de41719dd498ce421ec868fc88d0654e7ea7abec04400b20cd92c7a7.png"
How is this done?
Computational graph#
Image("local/imgs/ComputationalGraph.png")
data:image/s3,"s3://crabby-images/f7a48/f7a48f371699d4cf558e01c8d5f7d42b8dc90336" alt="../_images/e3e4d6e9db8aeafe68ed2d5589f945120c3351514dfc474503c581fdec10488f.png"
Image("local/imgs/ComputationalGraph2.png")
data:image/s3,"s3://crabby-images/00554/0055439e6bfe754c7f050bd41136836258905e22" alt="../_images/24b7b85fdfb937f21fdaed0addde605174ebf20fe5ee8af8b920ac29ba256c81.png"
A symbolic computational package can do a lot of things:#
sy.expand((x+2)**2)
data:image/s3,"s3://crabby-images/8f92d/8f92d41d6352dbc22d466335e69b4c0826e33916" alt="../_images/e8f25e278627484b698de233a808a69e796f0ceefa0fa4de712f851a77703158.png"
sy.factor( x**2-2*x-8 )
data:image/s3,"s3://crabby-images/c72fd/c72fde0041fca0bf0ccd75222398b6bf38ce4a7b" alt="../_images/ebe5aff5caf3c978509526860fc9283af939ac378d4c58954d295e1d0057f5f5.png"
sy.solve( x**2 + 2*x - 8, x)
data:image/s3,"s3://crabby-images/931e6/931e66386c6f8dee53a0baecc76bdedf47f4ecd0" alt="../_images/33fbb80bb61851bb104eba43a5a7aad6835fd0d10a4373695398730ab1b85950.png"
a = sy.symbols("alpha")
sy.solve( a*x**2 + 2*x - 8, x)
data:image/s3,"s3://crabby-images/db409/db409bfdae7aaaf100f58548365c2700e2c4c573" alt="../_images/00fb16a38ae1cab5d5a19c76d4bd9ff70d56ec84327c2951a7cc1d936df70eae.png"
differential equations, solving \(\frac{df}{dt}=f(t)+t\)
t, C1 = sy.symbols("t C1")
f = sy.symbols("f", cls=sy.Function)
dydt = f(t)+t
eq = dydt-sy.diff(f(t),t)
eq
data:image/s3,"s3://crabby-images/18dfc/18dfc448996d3ea59b3bc54bea50823d74f794c3" alt="../_images/a5466e40fe1fca4a6c143bb68a4669657fa3774f2173c87a32d19a6ecee58426.png"
yt = sy.dsolve(eq, f(t))
yt
data:image/s3,"s3://crabby-images/15ca7/15ca7d60a602ab8c19a45a10b269040f8775fa31" alt="../_images/ffa2dea42f42249eb7f556b74572bf0c24bb592426cd71fac03c22a089ebf241.png"
systems of equations
sy.solve ([x**2+y, 3*y-x])
data:image/s3,"s3://crabby-images/325e9/325e9cccd1cf796d2eb980eba70bc13413efb0ef" alt="../_images/62860aab5ffa76783084a9402eaa557ec48b0ab32d5f3255cf86fda99e6bc168.png"
Sympy
to Python
and Numpy
#
f = (sy.sin(x) + x**2)/2
f
data:image/s3,"s3://crabby-images/ae7ee/ae7eebc523112f05e7c6628ccc419940ec7dbde0" alt="../_images/60afe6cfdd5a9c2bc149f4b6b54cf7bb24d4ecb559ffb4393870c32df13fbae9.png"
f.subs({x:10})
data:image/s3,"s3://crabby-images/10567/10567b56b800a428557358641b04abf8b411a3ce" alt="../_images/669a60a56adf7467f5ba052b9dd062d89604b36a225bf4927992418b84b20802.png"
sy.N(f.subs({x:10}))
data:image/s3,"s3://crabby-images/054eb/054eb0e2d386ecf53922a83b1bce4b0f3110cbc7" alt="../_images/ac544c0385a8bdac19abdab44646506c5123b2d79954f6113c78cc3aae273692.png"
f1 = sy.lambdify(x, f)
f1(10)
data:image/s3,"s3://crabby-images/054eb/054eb0e2d386ecf53922a83b1bce4b0f3110cbc7" alt="../_images/ac544c0385a8bdac19abdab44646506c5123b2d79954f6113c78cc3aae273692.png"
and a vectorized version
f2 = sy.lambdify(x, f, "numpy")
f2(10)
data:image/s3,"s3://crabby-images/054eb/054eb0e2d386ecf53922a83b1bce4b0f3110cbc7" alt="../_images/ac544c0385a8bdac19abdab44646506c5123b2d79954f6113c78cc3aae273692.png"
f2(np.array([10,2,3]))
array([49.72798944, 2.45464871, 4.57056 ])
the lambdified version is faster, and the vectorized one is even faster
%timeit sy.N(f.subs({x:10}))
124 µs ± 3.06 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%timeit f1(10)
1.28 µs ± 66.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
%timeit [f1(i) for i in range(1000)]
1.37 ms ± 20.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%timeit f2(np.arange(1000))
15.8 µs ± 471 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
Let’s remember the loss function we defined before:
The gradient of \(\mathcal{L}(\cdot)\) is:
Using sympy
to obtain the gradient.#
y = d.densidad_escamas.values
X = np.r_[[[1]*len(d), d.longitud.values]].T
w0,w1 = sy.symbols("w_0 w_1")
w0,w1
data:image/s3,"s3://crabby-images/9df17/9df177b3e8bf99b191a8c65630054803376afff4" alt="../_images/56c56dc6687ed4631014d8dbb5cd02e0e665abc24a868e11e96559c723269a4d.png"
we first obtain the cost expression for a few summation terms, so that we can print it out and understand it
L = 0
for i in range(10):
L += (X[i,0]*w0+X[i,1]*w1-y[i])**2
L = L/len(X)
L
data:image/s3,"s3://crabby-images/631a6/631a6a674863a2dd747d1fb01133aafef746234b" alt="../_images/a92cd4e1e1358c705a45941201370ab8a2e1d0e2082646c237f26a0e647a6e7c.png"
we can now simplify the expression, using sympy
mechanics
L = L.simplify()
L
data:image/s3,"s3://crabby-images/12368/1236808a6a076cab583e7301377da5023278c876" alt="../_images/419e7c2b3c2a378c25746a4c670ebeae36f14aceddd9f4e5f8fdf84a5dac16a9.png"
we now build the full expression
def build_regression_cost_expression(X,y):
expr_cost = 0
for i in range(len(X)):
expr_cost += (X[i,0]*w0+X[i,1]*w1-y[i])**2/len(X)
expr_cost = expr_cost.simplify()
return expr_cost
y = d.densidad_escamas.values
X = np.r_[[[1]*len(d), d.longitud.values]].T
expr_cost = build_regression_cost_expression(X,y)
expr_cost
data:image/s3,"s3://crabby-images/f022d/f022d3f4cc50897bb4d11324ffd4317c9a51277c" alt="../_images/3adb568896a9fc7ca785167cc679ea56e846b0fd342b4ce71431d45620576b98.png"
obtain derivatives symbolically
expr_dw0 = expr_cost.diff(w0)
expr_dw1 = expr_cost.diff(w1)
expr_dw0, expr_dw1
data:image/s3,"s3://crabby-images/25cfd/25cfd186bb5e15e433ba808d3b31939935f4f0dd" alt="../_images/0e42b242a85c77665b436a3d65355e43ba46cf3339914322decd3976c3992839.png"
and obtain regular Python so that we can use them in optimization
s_cost = sy.lambdify([[w0,w1]], expr_cost, "numpy")
d0 = sy.lambdify([[w0,w1]], expr_dw0, "numpy")
d1 = sy.lambdify([[w0,w1]], expr_dw1, "numpy")
s_grad = lambda x: np.array([d0(x), d1(x)])
and now we can minimize
r = minimize(s_cost, [0,0], jac=s_grad, method="BFGS")
r
fun: 2.7447662570799594
hess_inv: array([[ 5.57425906, -1.09086733],
[-1.09086733, 0.23434848]])
jac: array([-1.73778293e-07, -9.26928138e-07])
message: 'Optimization terminated successfully.'
nfev: 10
nit: 9
njev: 10
status: 0
success: True
x: array([12.6899981, -0.7180591])
observe that hand derived functions and the ones obtained by sympy
evaluate to the same values
w0 = np.random.random()*5+10
w1 = np.random.random()*4-3
w = np.r_[w0,w1]
print ("theta:",w)
print ("cost analytic:", n_cost(w))
print ("cost symbolic:", s_cost(w))
print ("gradient analytic:", n_grad(w))
print ("gradient symbolic:", s_grad(w))
theta: [10.98668483 0.37723461]
cost analytic: 16.790694513357096
cost symbolic: 16.790694513357067
gradient analytic: [ 6.77949907 36.19071996]
gradient symbolic: [ 6.77949907 36.19071996]
Expression swell
Multidimensional vector-valued functions require additional contructs beyond algebraic expressions:
Gradient
Jacobian
Hessian
Jacobian vector products (jvp): \(J_g(\theta){\bf v}\)
Vector jacobian products (vjp): \({\bf u}^\top J_g(\theta)\)
The partial derivatives are mainly built on top of jvp and vjp operators. Note that \(\frac{\partial g}{\partial \theta_i}\) is just the \(i\)-column vector of the Jacobian, or the jvp when the vector \({\bf v} = [0,\cdots,i,\cdots,0]\) where \(i=1\), or, even more efficiently using vjp.