-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_vocabulary.py
79 lines (76 loc) · 5.11 KB
/
build_vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from PIL import Image
import numpy as np
from cyvlfeat.sift.dsift import dsift
from cyvlfeat.kmeans import kmeans
from time import time
from tqdm import tqdm
#This function will sample SIFT descriptors from the training images,
#cluster them with kmeans, and then return the cluster centers.
def build_vocabulary(image_paths, vocab_size):
    """Build a visual-word vocabulary by clustering dense SIFT descriptors.

    Samples dense SIFT descriptors from each training image, stacks them
    into one large matrix, and clusters them with k-means. The resulting
    cluster centroids are the visual-word vocabulary.

    Args:
        image_paths: list of file paths to the training images.
        vocab_size: number of k-means clusters (visual words) desired.

    Returns:
        numpy.ndarray of shape (vocab_size, 128): the k-means cluster
        centroids, i.e. the visual-word vocabulary.
    """
    features = []
    for path in tqdm(image_paths):
        # dsift expects a 2-D (grayscale) float array. convert('L') makes
        # RGB/RGBA inputs work and is a no-op for already-grayscale images.
        img = np.asarray(Image.open(path).convert('L'), dtype='float32')
        # dsift returns (frames, descriptors); descriptors has shape (F, 128),
        # one 128-D SIFT vector per sampled keypoint. step=[3, 3] keeps the
        # sampling sparse enough to be fast while staying representative;
        # fast=True uses the ~20x-faster approximate SIFT.
        _, descriptors = dsift(img, step=[3, 3], fast=True)
        features.append(descriptors)
    # Stack the per-image (F, 128) descriptor matrices into one (M, 128)
    # matrix, where M is the total number of sampled descriptors.
    features = np.concatenate(features, axis=0).astype('float32')
    print("Compute vocab...")
    start_time = time()
    # Cluster the M descriptors into vocab_size groups; kmeans returns the
    # (vocab_size, 128) centroid matrix, which is the vocabulary.
    vocab = kmeans(features, vocab_size, initialization="PLUSPLUS")
    end_time = time()
    print(f"It takes {((end_time - start_time)/60):.2f} minutes to compute vocab.")
    return vocab