In [1]:
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import numpy as np
In [3]:
# loads the GloVe embeddings: returns the vocabulary as a set
# and a dict mapping each word to its embedding vector
def load_glove_vectors(glove_file):
    with open(glove_file, 'r', encoding="utf-8") as file:
        # unique words in the vocabulary
        words = set()
        word_to_vec = {}
        # each line starts with a word, followed by the values of its embedding
        for line in file:
            line = line.strip().split()
            # the word itself
            curr_word = line[0]
            words.add(curr_word)
            # the remaining entries are the embedding values
            word_to_vec[curr_word] = np.array(line[1:], dtype=np.float64)

    return words, word_to_vec
In [5]:
words, word_to_vec = load_glove_vectors("glove.6B.50d.txt")
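
As an aside, the gensim imports above offer an alternative loader: gensim 4.0+ can read the headerless GloVe text format directly with no_header=True, and older versions can first convert the file with glove2word2vec. A minimal sketch, assuming gensim >= 4.0 and the same glove.6B.50d.txt file:

In [ ]:
# gensim can parse the headerless GloVe text format directly (gensim >= 4.0)
glove_kv = KeyedVectors.load_word2vec_format("glove.6B.50d.txt", binary=False, no_header=True)
print(glove_kv["king"].shape)  # (50,), matching word_to_vec["king"]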

Cosine Similarity

We will use cosine similarity to find the most suitable word. For the analogy task we take $e_b - e_a$ and $e_d - e_c$ as the two vectors whose cosine we compute, where $e_d$ is searched over all the other words in the vocabulary.

Given two vectors $u$ and $v$, cosine similarity is defined as follows:

$\text{Cosine Similarity}(u, v) = \frac{u \cdot v}{||u||_2 \, ||v||_2} = \cos(\theta)$

where $u \cdot v$ is the dot product of the two vectors, $||u||_2$ is the L2 norm of the vector $u$, and $\theta$ is the angle between $u$ and $v$.

This similarity depends on the angle between $u$ and $v$.

If $u$ and $v$ are very similar, their cosine similarity will be close to $1$.

If they are dissimilar it will be close to $0$, and vectors pointing in opposite directions give values approaching $-1$.
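
As a quick worked example, $u = (1, 0)$ and $v = (1, 1)$ give $\frac{u \cdot v}{||u||_2 \, ||v||_2} = \frac{1}{\sqrt{2}} \approx 0.707$, while the orthogonal pair $(1, 0)$ and $(0, 1)$ gives $0$.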

In [8]:
# finds the cosine similarity between u and v
def find_cosine_similarity(u, v):
    '''
    Arguments:
        u(n,) - a word embedding vector
        v(n,) - a word embedding vector
    Returns:
        cosine_sim - the cosine similarity between u and v
    '''
    # find the dot product between u and v
    dot = np.dot(u, v)
    # find the L2 norm of u
    norm_u = np.sqrt(np.sum(u**2))
    # compute the L2 norm of v
    norm_v = np.sqrt(np.sum(v**2))
    # compute the cosine similarity
    cosine_sim = dot / (norm_u * norm_v)

    return cosine_sim
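
A quick sanity check (a sketch, assuming scipy is installed): scipy.spatial.distance.cosine returns the cosine distance, i.e. $1 - \text{similarity}$, so the two should agree up to floating-point error.

In [ ]:
from scipy.spatial.distance import cosine

u_test, v_test = np.array([1.0, 2.0, 3.0]), np.array([4.0, 5.0, 6.0])
# scipy gives the cosine distance; 1 - distance recovers the similarity
assert np.isclose(find_cosine_similarity(u_test, v_test), 1 - cosine(u_test, v_test))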
In [10]:
# sample words
father = word_to_vec["father"]
mother = word_to_vec["mother"]
king = word_to_vec["king"]
queen = word_to_vec["queen"]
bat = word_to_vec["bat"]
crow = word_to_vec["crow"]
india = word_to_vec["india"]
italy = word_to_vec["italy"]
delhi = word_to_vec["delhi"]
rome = word_to_vec["rome"]
love = word_to_vec["love"]
like = word_to_vec["like"]
hate = word_to_vec["hate"]


print("cosine_similarity(king, queen) = ", find_cosine_similarity(king, queen))
print("cosine_similarity(father, mother) = ", find_cosine_similarity(father, mother))
print("cosine_similarity(king - queen, father - mother) = ",find_cosine_similarity(king - queen, father - mother))
print("cosine_similarity(bat, crow) = ",find_cosine_similarity(bat, crow))
print("cosine_similarity(india - delhi, rome - italy) = ",find_cosine_similarity(india - delhi, rome - italy))
print("cosine_similarity(love, like) = ", find_cosine_similarity(love, like))
cosine_similarity(king, queen) =  0.7839043010964117
cosine_similarity(father, mother) =  0.8909038442893615
cosine_similarity(king - queen, father - mother) =  0.661889473579435
cosine_similarity(bat, crow) =  0.41574518317394416
cosine_similarity(india - delhi, rome - italy) =  -0.6363974204130605
cosine_similarity(love, like) =  0.7682945294633257
In [12]:
# Word analogy task: a is to b as c is to ____
def find_analogy(word_a, word_b, word_c, word_to_vec):
    # convert words to lower case
    word_a = word_a.lower()
    word_b = word_b.lower()
    word_c = word_c.lower()

    # find the word embeddings for word_a, word_b and word_c
    e_a, e_b, e_c = word_to_vec[word_a], word_to_vec[word_b], word_to_vec[word_c]
    
    words = word_to_vec.keys()
    max_cosine_sim = -999              
    best_word = None                  

    # search for word_d in the whole word vector set
    for w in words:        
        # ignore input words
        if w in [word_a, word_b, word_c]:
            continue

        # Compute cosine similarity between the vectors u and v
        #u:(e_b - e_a) 
        #v:((w's vector representation) - e_c)
        cosine_sim = find_cosine_similarity(e_b - e_a, word_to_vec[w] - e_c)
        
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            # update word_d
            best_word = w
        
    return best_word
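
The loop above scores one candidate at a time, which is slow over the full 400,000-word vocabulary. Below is a vectorized sketch of the same search (find_analogy_fast is a hypothetical variant, not part of the original): it stacks all embeddings into one matrix so every candidate is scored with a single matrix product.

In [ ]:
# vectorized variant of find_analogy: score all candidates in one matrix product
def find_analogy_fast(word_a, word_b, word_c, word_to_vec):
    words = list(word_to_vec.keys())
    # stack all embeddings into a (vocab_size, dim) matrix
    emb = np.stack([word_to_vec[w] for w in words])
    e_a, e_b, e_c = (word_to_vec[w.lower()] for w in (word_a, word_b, word_c))
    u = e_b - e_a
    V = emb - e_c  # every candidate's (e_w - e_c) row at once
    # cosine similarity of u against every row of V; the epsilon avoids 0/0 when w == word_c
    sims = V @ u / (np.linalg.norm(V, axis=1) * np.linalg.norm(u) + 1e-12)
    # exclude the three input words before taking the argmax
    for w in (word_a, word_b, word_c):
        sims[words.index(w.lower())] = -np.inf
    return words[int(np.argmax(sims))]

find_analogy_fast('france', 'paris', 'japan', word_to_vec) should return the same 'tokyo' as the loop version.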
In [14]:
examples = [('france', 'paris', 'japan'), ('tall', 'taller', 'small'), ('morning', 'breakfast', 'evening')]
for example in examples:
    print ('{} -> {} :: {} -> {}'.format( *example, find_analogy(*example, word_to_vec)))
france -> paris :: japan -> tokyo
tall -> taller :: small -> outnumber
morning -> breakfast :: evening -> dinners
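
Note that the second analogy returns "outnumber" rather than the expected "smaller"; 50-dimensional GloVe vectors capture these linear relations only approximately.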
In [16]:
# for taking input from the user and doing word analogy task on that
def take_input():
    print('a --> b :: c --> d')
    print('Enter a, b, c words separated by space')
    words = input().split(' ')
    
    best_pick = find_analogy(*words, word_to_vec)
    print ('{} -> {} :: {} -> {}'.format( *words, best_pick))
    print('Best pick: ' + best_pick)
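
take_input will raise a KeyError if the user enters a word outside the vocabulary, and crash on the wrong number of words. A defensive variant (a sketch; take_input_safe is not part of the original):

In [ ]:
# input-validating variant of take_input
def take_input_safe():
    print('a --> b :: c --> d')
    print('Enter a, b, c words separated by space')
    words = input().split()
    if len(words) != 3:
        print('Please enter exactly three words')
        return
    # check all three words are in the vocabulary before searching
    missing = [w for w in words if w.lower() not in word_to_vec]
    if missing:
        print('Not in vocabulary: ' + ', '.join(missing))
        return
    print('{} -> {} :: {} -> {}'.format(*words, find_analogy(*words, word_to_vec)))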
In [18]:
take_input()
a --> b :: c --> d
Enter a, b, c words separated by space
king -> queen :: boy -> girl
Best pick: girl