Exploring Position Embeddings: Similarity and Shuffling
Investigating the statistical properties of GPT-2's position embeddings by comparing actual similarity distributions against random shuffles.
🚀 The Core Concept
How much of a transformer's position awareness is "real" structure versus random noise? In this exploration, we'll dive deep into the Word Position Embedding (WPE) matrix of GPT-2. By calculating cosine similarities across different dimensions and comparing them to a randomly shuffled version of the same data, we can visualize the inherent organization (or lack thereof) in learned positional representations.
This content is adapted from "A deep understanding of AI language model mechanisms." It has been curated and organized for educational purposes on this portfolio. No copyright infringement is intended.
1. Setup and Matrix Extraction
We'll start by loading the GPT-2 model and extracting its position embedding matrix gpt2.wpe.weight.
import numpy as np
import matplotlib.pyplot as plt

from transformers import GPT2Model

import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# Pull the learned position-embedding table out of a pretrained GPT-2.
# `wpe.weight` is the Word Position Embedding matrix; detach it from the
# autograd graph and convert to a plain numpy array for analysis.
gpt2 = GPT2Model.from_pretrained('gpt2')
positions = gpt2.wpe.weight.detach().numpy()
2. Exercise 1: Multi-Dimensional Similarity
We'll calculate cosine similarities in two ways:
- Token Index Similarity: How similar are different positions?
- Embedding Dimension Similarity: How correlated are different embedding channels?
# Cosine similarity is the dot product of unit-length vectors, so
# normalize first, then take matrix products.

# similarity between token positions ("time"): normalize each row
unit_rows = positions / np.linalg.norm(positions, axis=1, keepdims=True)
cossim_tokens = unit_rows @ unit_rows.T

# similarity between embedding dimensions: normalize each column
unit_cols = positions / np.linalg.norm(positions, axis=0, keepdims=True)
cossim_embeds = unit_cols.T @ unit_cols

# draw the images (also copied from previous code file)
fig,axs = plt.subplots(1,2,figsize=(12,5))
h = axs[0].imshow(cossim_tokens,vmin=-1,vmax=1)
axs[0].set(xlabel='Token index ("time")',ylabel='Token index ("time")',title='$S_c$ over "time"')
ch = fig.colorbar(h,ax=axs[0],pad=.02,fraction=.046)
ch.ax.tick_params(labelsize=10)
ch.ax.set_yticks(np.arange(-1,1.1,.5))
h = axs[1].imshow(cossim_embeds,vmin=-1,vmax=1)
axs[1].set(xlabel='Embedding index',ylabel='Embedding index',title='$S_c$ across embeddings')
ch = fig.colorbar(h,ax=axs[1],pad=.02,fraction=.046)
ch.ax.tick_params(labelsize=10)
ch.ax.set_yticks(np.arange(-1,1.1,.5))
plt.tight_layout()
plt.show()3. Distribution Visualization
Now, let's look at the histograms of these similarities:
# Small demo of np.triu: extract the strictly-upper-triangular entries of
# a matrix.  Seeded with the modern Generator API so the demo is
# reproducible run-to-run (np.random.randint was unseeded).
rng = np.random.default_rng(0)
A = rng.integers(0, 9, (4, 4))  # same range as randint(0, 9): values 0..8
print(A)
print('')

# np.triu(A, 1) zeroes everything on/below the diagonal; np.nonzero then
# returns the indices of the surviving entries.  NOTE: this trick also
# silently drops any upper-triangle entry that is exactly 0.
print(A[np.nonzero(np.triu(A, 1))])
# get the unique cosine similarity values from the upper-triangle
unique_cs_embeds = cossim_embeds[np.nonzero(np.triu(cossim_embeds,1))] # note the ",1" to avoid the trivial diagonal
unique_cs_tokens = cossim_tokens[np.nonzero(np.triu(cossim_tokens,1))]
# get their distributions
embed_hy,embed_hx = np.histogram(unique_cs_embeds,100)
token_hy,token_hx = np.histogram(unique_cs_tokens,100)
# visualize!
plt.figure(figsize=(12,4))
plt.bar(embed_hx[:-1],embed_hy,width=np.diff(embed_hx[:2]),alpha=.4,label='$S_c$ across embeddings')
plt.bar(token_hx[:-1],token_hy,width=np.diff(token_hx[:2]),alpha=.4,label='$S_c$ across "time"')
plt.plot(embed_hx[:-1],embed_hy)
plt.plot(token_hx[:-1],token_hy)
plt.legend()
plt.gca().set(xlim=[-1,1],xlabel='Cosine similarity',ylabel='Count',title='Distributions of $S_c$ in the positional embeddings matrix')
plt.show()4. Exercise 2: Shuffling and Baseline Comparison
To prove that the structure isn't just a byproduct of the numbers' distribution, we'll randomly shuffle the entire matrix and see how that affects the similarities.
# Destroy all positional structure while keeping the identical value
# distribution: flatten to 1D (flatten() returns a copy, so `positions`
# itself is untouched), shuffle in place, then restore the shape.
randomEmbeds = positions.flatten()
np.random.shuffle(randomEmbeds)
randomEmbeds = randomEmbeds.reshape(positions.shape)

# Original vs. shuffled matrices, same color limits for fair comparison.
_, axs = plt.subplots(2, 1, figsize=(8, 7))
panel_data = (
    (positions, 'GPT-2 position embeddings matrix'),
    (randomEmbeds, 'Shuffled embeddings matrix'),
)
for ax, (mat, title) in zip(axs, panel_data):
    ax.imshow(mat.T, aspect='auto', vmin=-.1, vmax=.1)
    ax.set(xlabel='Token position', ylabel='Dimensions', title=title)
plt.tight_layout()
plt.show()
# calculate cosine similarity
Rnorm0 = randomEmbeds / np.linalg.norm(randomEmbeds,axis=0,keepdims=True)
cossim_random = Rnorm0.T @ Rnorm0# get the unique cosine similarity values from the upper-triangle
unique_cs_random = cossim_random[np.nonzero(np.triu(cossim_random,1))]
# get their distribution
random_hy,random_hx = np.histogram(unique_cs_random,100)
# visualize!
plt.figure(figsize=(12,4))
plt.bar(embed_hx[:-1],embed_hy,width=np.diff(embed_hx[:2]),alpha=.4,label='$S_c$ across embeddings')
plt.bar(random_hx[:-1],random_hy,width=np.diff(random_hx[:2]),alpha=.4,label='$S_c$ in shuffled vectors')
plt.plot(embed_hx[:-1],embed_hy)
plt.plot(random_hx[:-1],random_hy)
plt.legend()
plt.gca().set(xlim=[-1,1],xlabel='Cosine similarity',ylabel='Count',title='Distributions of $S_c$ in the positional embeddings matrix')
plt.show()5. Exercise 3: Finding Similar Pairs
Finally, we identify the most similar pairs of embedding dimensions and visualize them. This helps us see if the dimensions are simply copies of each other or have unique roles.
# reminder: positions matrix is size [index,embedding]
sortidx = np.argsort(np.triu(cossim_embeds,1).flatten())[::-1]
xx,yy = np.unravel_index(sortidx,cossim_embeds.shape)
plt.figure(figsize=(10,8))
for i in np.linspace(0,200,10).astype(int):
# get and print the pairs
pairname = f'({xx[i]},{yy[i]})'
print(f'Cossim of {cossim_embeds[xx[i],yy[i]]:.3f} in pair {pairname}')
# plot them
plt.plot(positions[:,xx[i]],positions[:,yy[i]],'.-',alpha=.5,label=pairname)
# adjustments
plt.gca().set(xlabel='Embedding dimension "x"',ylabel='Embedding dimension "y"')
plt.legend(fontsize=9, bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title('Top 10 Most Similar Embedding Dimensions')
plt.tight_layout()
plt.show()Cossim of 0.992 in pair (356,528)
Cossim of 0.943 in pair (320,540)
Cossim of 0.915 in pair (308,528)
Cossim of 0.885 in pair (107,271)
Cossim of 0.864 in pair (365,525)